{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 1080, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09259259259259259, "grad_norm": 2.028742551803589, "learning_rate": 1.6666666666666667e-05, "loss": 1.0574, "step": 10 }, { "epoch": 0.18518518518518517, "grad_norm": 3.1797935962677, "learning_rate": 3.518518518518519e-05, "loss": 0.3119, "step": 20 }, { "epoch": 0.2777777777777778, "grad_norm": 1.121282696723938, "learning_rate": 5.370370370370371e-05, "loss": 0.2573, "step": 30 }, { "epoch": 0.37037037037037035, "grad_norm": 1.758172869682312, "learning_rate": 7.222222222222222e-05, "loss": 0.2499, "step": 40 }, { "epoch": 0.46296296296296297, "grad_norm": 1.6619542837142944, "learning_rate": 9.074074074074075e-05, "loss": 0.1927, "step": 50 }, { "epoch": 0.5555555555555556, "grad_norm": 1.7146813869476318, "learning_rate": 9.99941402841295e-05, "loss": 0.1834, "step": 60 }, { "epoch": 0.6481481481481481, "grad_norm": 1.0112364292144775, "learning_rate": 9.994727079754844e-05, "loss": 0.1769, "step": 70 }, { "epoch": 0.7407407407407407, "grad_norm": 1.1895333528518677, "learning_rate": 9.985357576451127e-05, "loss": 0.1595, "step": 80 }, { "epoch": 0.8333333333333334, "grad_norm": 0.6763359904289246, "learning_rate": 9.971314302407412e-05, "loss": 0.1326, "step": 90 }, { "epoch": 0.9259259259259259, "grad_norm": 1.2080364227294922, "learning_rate": 9.952610423187516e-05, "loss": 0.1305, "step": 100 }, { "epoch": 1.0185185185185186, "grad_norm": 1.3985177278518677, "learning_rate": 9.929263473670749e-05, "loss": 0.1329, "step": 110 }, { "epoch": 1.1111111111111112, "grad_norm": 0.8234744668006897, "learning_rate": 9.901295341612973e-05, "loss": 0.1282, "step": 120 }, { "epoch": 1.2037037037037037, "grad_norm": 0.7556248307228088, "learning_rate": 9.86873224712684e-05, "loss": 0.1128, "step": 130 }, { "epoch": 1.2962962962962963, "grad_norm": 1.1482961177825928, "learning_rate": 9.831604718100444e-05, "loss": 0.1102, "step": 140 }, { "epoch": 1.3888888888888888, "grad_norm": 0.9877162575721741, "learning_rate": 9.789947561577445e-05, "loss": 0.1008, "step": 150 }, { "epoch": 1.4814814814814814, "grad_norm": 0.8874509930610657, "learning_rate": 9.743799831125472e-05, "loss": 0.0917, "step": 160 }, { "epoch": 1.574074074074074, "grad_norm": 1.0449544191360474, "learning_rate": 9.693204790223422e-05, "loss": 0.0847, "step": 170 }, { "epoch": 1.6666666666666665, "grad_norm": 1.273176670074463, "learning_rate": 9.638209871701967e-05, "loss": 0.0861, "step": 180 }, { "epoch": 1.7592592592592593, "grad_norm": 0.42473676800727844, "learning_rate": 9.578866633275288e-05, "loss": 0.0984, "step": 190 }, { "epoch": 1.8518518518518519, "grad_norm": 1.792026400566101, "learning_rate": 9.515230709205748e-05, "loss": 0.0881, "step": 200 }, { "epoch": 1.9444444444444444, "grad_norm": 1.4421114921569824, "learning_rate": 9.447361758146791e-05, "loss": 0.0854, "step": 210 }, { "epoch": 2.037037037037037, "grad_norm": 0.9304073452949524, "learning_rate": 9.375323407212969e-05, "loss": 0.0693, "step": 220 }, { "epoch": 2.1296296296296298, "grad_norm": 0.8037934303283691, "learning_rate": 9.299183192329556e-05, "loss": 0.0721, "step": 230 }, { "epoch": 2.2222222222222223, "grad_norm": 1.0774219036102295, "learning_rate": 9.219012494917645e-05, "loss": 0.0779, "step": 240 }, { "epoch": 2.314814814814815, "grad_norm": 0.8292256593704224, "learning_rate": 9.134886474974091e-05, "loss": 0.0666, "step": 250 }, { "epoch": 2.4074074074074074, "grad_norm": 0.794555127620697, "learning_rate": 9.046884000609047e-05, "loss": 0.0698, "step": 260 }, { "epoch": 2.5, "grad_norm": 1.0664926767349243, "learning_rate": 8.955087574107137e-05, "loss": 0.0759, "step": 270 }, { "epoch": 2.5925925925925926, "grad_norm": 0.4675469398498535, "learning_rate": 8.859583254581605e-05, "loss": 0.0651, "step": 280 }, { "epoch": 2.685185185185185, "grad_norm": 0.973216712474823, "learning_rate": 8.760460577293921e-05, "loss": 0.0738, "step": 290 }, { "epoch": 2.7777777777777777, "grad_norm": 0.5992857813835144, "learning_rate": 8.657812469714518e-05, "loss": 0.0633, "step": 300 }, { "epoch": 2.8703703703703702, "grad_norm": 0.6723818778991699, "learning_rate": 8.55173516440332e-05, "loss": 0.075, "step": 310 }, { "epoch": 2.962962962962963, "grad_norm": 0.7937875390052795, "learning_rate": 8.442328108791759e-05, "loss": 0.0634, "step": 320 }, { "epoch": 3.0555555555555554, "grad_norm": 0.6974025964736938, "learning_rate": 8.329693871950843e-05, "loss": 0.0611, "step": 330 }, { "epoch": 3.148148148148148, "grad_norm": 0.8661198019981384, "learning_rate": 8.213938048432697e-05, "loss": 0.0625, "step": 340 }, { "epoch": 3.240740740740741, "grad_norm": 0.8185564875602722, "learning_rate": 8.095169159275713e-05, "loss": 0.0615, "step": 350 }, { "epoch": 3.3333333333333335, "grad_norm": 0.4486139714717865, "learning_rate": 7.973498550266115e-05, "loss": 0.0505, "step": 360 }, { "epoch": 3.425925925925926, "grad_norm": 0.7096815705299377, "learning_rate": 7.849040287551331e-05, "loss": 0.0557, "step": 370 }, { "epoch": 3.5185185185185186, "grad_norm": 0.6273751854896545, "learning_rate": 7.721911050703032e-05, "loss": 0.049, "step": 380 }, { "epoch": 3.611111111111111, "grad_norm": 0.5674566626548767, "learning_rate": 7.592230023330069e-05, "loss": 0.0541, "step": 390 }, { "epoch": 3.7037037037037037, "grad_norm": 0.6884677410125732, "learning_rate": 7.460118781343893e-05, "loss": 0.0572, "step": 400 }, { "epoch": 3.7962962962962963, "grad_norm": 0.512453019618988, "learning_rate": 7.325701178981184e-05, "loss": 0.0449, "step": 410 }, { "epoch": 3.888888888888889, "grad_norm": 0.49008965492248535, "learning_rate": 7.18910323269056e-05, "loss": 0.0386, "step": 420 }, { "epoch": 3.9814814814814814, "grad_norm": 0.5494919419288635, "learning_rate": 7.050453002992201e-05, "loss": 0.0511, "step": 430 }, { "epoch": 4.074074074074074, "grad_norm": 0.8010113835334778, "learning_rate": 6.90988047442116e-05, "loss": 0.049, "step": 440 }, { "epoch": 4.166666666666667, "grad_norm": 0.5579728484153748, "learning_rate": 6.767517433666918e-05, "loss": 0.053, "step": 450 }, { "epoch": 4.2592592592592595, "grad_norm": 0.6447908282279968, "learning_rate": 6.623497346023418e-05, "loss": 0.0537, "step": 460 }, { "epoch": 4.351851851851852, "grad_norm": 0.6082550883293152, "learning_rate": 6.477955230265394e-05, "loss": 0.0567, "step": 470 }, { "epoch": 4.444444444444445, "grad_norm": 0.7138601541519165, "learning_rate": 6.331027532068334e-05, "loss": 0.0524, "step": 480 }, { "epoch": 4.537037037037037, "grad_norm": 0.6580657958984375, "learning_rate": 6.182851996090713e-05, "loss": 0.0434, "step": 490 }, { "epoch": 4.62962962962963, "grad_norm": 0.49822911620140076, "learning_rate": 6.03356753683842e-05, "loss": 0.0426, "step": 500 }, { "epoch": 4.722222222222222, "grad_norm": 0.6944920420646667, "learning_rate": 5.88331410843248e-05, "loss": 0.044, "step": 510 }, { "epoch": 4.814814814814815, "grad_norm": 0.7584768533706665, "learning_rate": 5.7322325734021086e-05, "loss": 0.0432, "step": 520 }, { "epoch": 4.907407407407407, "grad_norm": 0.92609703540802, "learning_rate": 5.5804645706261514e-05, "loss": 0.0456, "step": 530 }, { "epoch": 5.0, "grad_norm": 0.9465698599815369, "learning_rate": 5.428152382546695e-05, "loss": 0.0413, "step": 540 }, { "epoch": 5.092592592592593, "grad_norm": 0.7870591878890991, "learning_rate": 5.2754388017793274e-05, "loss": 0.041, "step": 550 }, { "epoch": 5.185185185185185, "grad_norm": 0.5164716839790344, "learning_rate": 5.1224669972451245e-05, "loss": 0.0446, "step": 560 }, { "epoch": 5.277777777777778, "grad_norm": 0.6099451184272766, "learning_rate": 4.969380379949837e-05, "loss": 0.0442, "step": 570 }, { "epoch": 5.37037037037037, "grad_norm": 0.4600468575954437, "learning_rate": 4.816322468536139e-05, "loss": 0.0387, "step": 580 }, { "epoch": 5.462962962962963, "grad_norm": 0.34807872772216797, "learning_rate": 4.6634367547349436e-05, "loss": 0.0436, "step": 590 }, { "epoch": 5.555555555555555, "grad_norm": 0.48843276500701904, "learning_rate": 4.510866568841981e-05, "loss": 0.0323, "step": 600 }, { "epoch": 5.648148148148148, "grad_norm": 0.48683032393455505, "learning_rate": 4.358754945345684e-05, "loss": 0.0355, "step": 610 }, { "epoch": 5.7407407407407405, "grad_norm": 0.386310875415802, "learning_rate": 4.207244488832429e-05, "loss": 0.0316, "step": 620 }, { "epoch": 5.833333333333333, "grad_norm": 0.30438676476478577, "learning_rate": 4.056477240294779e-05, "loss": 0.0346, "step": 630 }, { "epoch": 5.925925925925926, "grad_norm": 0.495023250579834, "learning_rate": 3.9065945439681214e-05, "loss": 0.0342, "step": 640 }, { "epoch": 6.018518518518518, "grad_norm": 0.3270285129547119, "learning_rate": 3.7577369148204934e-05, "loss": 0.0328, "step": 650 }, { "epoch": 6.111111111111111, "grad_norm": 0.38187211751937866, "learning_rate": 3.610043906819868e-05, "loss": 0.0297, "step": 660 }, { "epoch": 6.203703703703703, "grad_norm": 0.4000631272792816, "learning_rate": 3.463653982102347e-05, "loss": 0.0297, "step": 670 }, { "epoch": 6.296296296296296, "grad_norm": 0.5061712861061096, "learning_rate": 3.318704381163986e-05, "loss": 0.0336, "step": 680 }, { "epoch": 6.388888888888889, "grad_norm": 0.3415641784667969, "learning_rate": 3.1753309941978616e-05, "loss": 0.031, "step": 690 }, { "epoch": 6.481481481481482, "grad_norm": 0.5254517793655396, "learning_rate": 3.0336682336970846e-05, "loss": 0.0353, "step": 700 }, { "epoch": 6.574074074074074, "grad_norm": 0.4749803841114044, "learning_rate": 2.8938489084431364e-05, "loss": 0.033, "step": 710 }, { "epoch": 6.666666666666667, "grad_norm": 0.4612673819065094, "learning_rate": 2.7560040989976892e-05, "loss": 0.0264, "step": 720 }, { "epoch": 6.7592592592592595, "grad_norm": 0.501693069934845, "learning_rate": 2.6202630348146324e-05, "loss": 0.0334, "step": 730 }, { "epoch": 6.851851851851852, "grad_norm": 0.5177351236343384, "learning_rate": 2.48675297308751e-05, "loss": 0.0292, "step": 740 }, { "epoch": 6.944444444444445, "grad_norm": 0.507723331451416, "learning_rate": 2.3555990794459542e-05, "loss": 0.0383, "step": 750 }, { "epoch": 7.037037037037037, "grad_norm": 0.24936509132385254, "learning_rate": 2.226924310612956e-05, "loss": 0.0285, "step": 760 }, { "epoch": 7.12962962962963, "grad_norm": 0.4175422787666321, "learning_rate": 2.1008492991329864e-05, "loss": 0.0328, "step": 770 }, { "epoch": 7.222222222222222, "grad_norm": 0.2943597137928009, "learning_rate": 1.9774922402790353e-05, "loss": 0.0321, "step": 780 }, { "epoch": 7.314814814814815, "grad_norm": 0.254304438829422, "learning_rate": 1.8569687812445896e-05, "loss": 0.0263, "step": 790 }, { "epoch": 7.407407407407407, "grad_norm": 0.27901774644851685, "learning_rate": 1.7393919127244347e-05, "loss": 0.0293, "step": 800 }, { "epoch": 7.5, "grad_norm": 0.3216925859451294, "learning_rate": 1.6248718629859244e-05, "loss": 0.0308, "step": 810 }, { "epoch": 7.592592592592593, "grad_norm": 0.46921682357788086, "learning_rate": 1.5135159945300231e-05, "loss": 0.0279, "step": 820 }, { "epoch": 7.685185185185185, "grad_norm": 0.4595056474208832, "learning_rate": 1.4054287034390046e-05, "loss": 0.0326, "step": 830 }, { "epoch": 7.777777777777778, "grad_norm": 0.49213147163391113, "learning_rate": 1.3007113215051675e-05, "loss": 0.0261, "step": 840 }, { "epoch": 7.87037037037037, "grad_norm": 0.2975919246673584, "learning_rate": 1.1994620212323177e-05, "loss": 0.0269, "step": 850 }, { "epoch": 7.962962962962963, "grad_norm": 0.4803672432899475, "learning_rate": 1.1017757237990878e-05, "loss": 0.0255, "step": 860 }, { "epoch": 8.055555555555555, "grad_norm": 0.4650854468345642, "learning_rate": 1.0077440100703683e-05, "loss": 0.0271, "step": 870 }, { "epoch": 8.148148148148149, "grad_norm": 0.3241572380065918, "learning_rate": 9.174550347402855e-06, "loss": 0.0227, "step": 880 }, { "epoch": 8.24074074074074, "grad_norm": 0.29282426834106445, "learning_rate": 8.309934436872074e-06, "loss": 0.0204, "step": 890 }, { "epoch": 8.333333333333334, "grad_norm": 0.19916000962257385, "learning_rate": 7.4844029461827e-06, "loss": 0.0223, "step": 900 }, { "epoch": 8.425925925925926, "grad_norm": 0.20894242823123932, "learning_rate": 6.698729810778065e-06, "loss": 0.0247, "step": 910 }, { "epoch": 8.518518518518519, "grad_norm": 0.36412182450294495, "learning_rate": 5.9536515989093325e-06, "loss": 0.0237, "step": 920 }, { "epoch": 8.61111111111111, "grad_norm": 0.295805424451828, "learning_rate": 5.249866821103016e-06, "loss": 0.0218, "step": 930 }, { "epoch": 8.703703703703704, "grad_norm": 0.3525577783584595, "learning_rate": 4.588035275307689e-06, "loss": 0.027, "step": 940 }, { "epoch": 8.796296296296296, "grad_norm": 0.28628063201904297, "learning_rate": 3.968777428333598e-06, "loss": 0.0281, "step": 950 }, { "epoch": 8.88888888888889, "grad_norm": 0.3433190882205963, "learning_rate": 3.3926738341653886e-06, "loss": 0.0242, "step": 960 }, { "epoch": 8.981481481481481, "grad_norm": 0.4827475845813751, "learning_rate": 2.8602645896928295e-06, "loss": 0.027, "step": 970 }, { "epoch": 9.074074074074074, "grad_norm": 0.262760728597641, "learning_rate": 2.3720488283703546e-06, "loss": 0.0236, "step": 980 }, { "epoch": 9.166666666666666, "grad_norm": 0.31685832142829895, "learning_rate": 1.9284842522794945e-06, "loss": 0.0269, "step": 990 }, { "epoch": 9.25925925925926, "grad_norm": 0.34328046441078186, "learning_rate": 1.5299867030334814e-06, "loss": 0.0256, "step": 1000 }, { "epoch": 9.351851851851851, "grad_norm": 0.4182087481021881, "learning_rate": 1.176929771925822e-06, "loss": 0.0221, "step": 1010 }, { "epoch": 9.444444444444445, "grad_norm": 0.19352613389492035, "learning_rate": 8.696444496886503e-07, "loss": 0.0312, "step": 1020 }, { "epoch": 9.537037037037036, "grad_norm": 0.26835450530052185, "learning_rate": 6.084188161890325e-07, "loss": 0.0217, "step": 1030 }, { "epoch": 9.62962962962963, "grad_norm": 0.26336586475372314, "learning_rate": 3.93497770354212e-07, "loss": 0.0217, "step": 1040 }, { "epoch": 9.722222222222221, "grad_norm": 0.34025803208351135, "learning_rate": 2.250828005789518e-07, "loss": 0.0237, "step": 1050 }, { "epoch": 9.814814814814815, "grad_norm": 0.3352171778678894, "learning_rate": 1.033317958302693e-07, "loss": 0.0214, "step": 1060 }, { "epoch": 9.907407407407408, "grad_norm": 0.19614988565444946, "learning_rate": 2.8358897626556968e-08, "loss": 0.0175, "step": 1070 }, { "epoch": 10.0, "grad_norm": 0.5722463130950928, "learning_rate": 2.3439302999639366e-10, "loss": 0.0238, "step": 1080 }, { "epoch": 10.0, "step": 1080, "total_flos": 0.0, "train_loss": 0.0688413071549601, "train_runtime": 1120.0938, "train_samples_per_second": 46.889, "train_steps_per_second": 0.964 } ], "logging_steps": 10, "max_steps": 1080, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 49, "trial_name": null, "trial_params": null }