{ "best_metric": null, "best_model_checkpoint": null, "epoch": 15.0, "eval_steps": 500, "global_step": 1755, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008547008547008548, "grad_norm": 2.3799763053026064, "learning_rate": 4.999995994512315e-06, "loss": 0.955, "step": 1 }, { "epoch": 0.017094017094017096, "grad_norm": 2.494033963310767, "learning_rate": 4.999983978062096e-06, "loss": 0.8287, "step": 2 }, { "epoch": 0.02564102564102564, "grad_norm": 2.201030171157517, "learning_rate": 4.999963950687846e-06, "loss": 0.9251, "step": 3 }, { "epoch": 0.03418803418803419, "grad_norm": 2.146415568844464, "learning_rate": 4.999935912453743e-06, "loss": 0.969, "step": 4 }, { "epoch": 0.042735042735042736, "grad_norm": 1.647445561456696, "learning_rate": 4.999899863449631e-06, "loss": 0.8622, "step": 5 }, { "epoch": 0.05128205128205128, "grad_norm": 1.6809896297017388, "learning_rate": 4.999855803791026e-06, "loss": 0.8451, "step": 6 }, { "epoch": 0.05982905982905983, "grad_norm": 1.5793650947747724, "learning_rate": 4.9998037336191115e-06, "loss": 0.8039, "step": 7 }, { "epoch": 0.06837606837606838, "grad_norm": 1.5869313841843558, "learning_rate": 4.9997436531007415e-06, "loss": 0.8729, "step": 8 }, { "epoch": 0.07692307692307693, "grad_norm": 1.597287174144046, "learning_rate": 4.999675562428437e-06, "loss": 0.8276, "step": 9 }, { "epoch": 0.08547008547008547, "grad_norm": 1.491091630883272, "learning_rate": 4.999599461820387e-06, "loss": 0.7985, "step": 10 }, { "epoch": 0.09401709401709402, "grad_norm": 1.4611263163645851, "learning_rate": 4.999515351520447e-06, "loss": 0.8432, "step": 11 }, { "epoch": 0.10256410256410256, "grad_norm": 1.5771492301572616, "learning_rate": 4.9994232317981405e-06, "loss": 0.9565, "step": 12 }, { "epoch": 0.1111111111111111, "grad_norm": 1.470477557035158, "learning_rate": 4.999323102948655e-06, "loss": 1.1248, "step": 13 }, { "epoch": 0.11965811965811966, "grad_norm": 1.1911774620668554, "learning_rate": 4.999214965292841e-06, "loss": 0.6717, "step": 14 }, { "epoch": 0.1282051282051282, "grad_norm": 1.3432981249016747, "learning_rate": 4.999098819177214e-06, "loss": 0.7159, "step": 15 }, { "epoch": 0.13675213675213677, "grad_norm": 1.3091938255082172, "learning_rate": 4.998974664973953e-06, "loss": 0.6966, "step": 16 }, { "epoch": 0.1452991452991453, "grad_norm": 1.4536201204300017, "learning_rate": 4.998842503080894e-06, "loss": 0.9169, "step": 17 }, { "epoch": 0.15384615384615385, "grad_norm": 1.2252488705593612, "learning_rate": 4.998702333921538e-06, "loss": 0.7068, "step": 18 }, { "epoch": 0.1623931623931624, "grad_norm": 1.1196522934351878, "learning_rate": 4.99855415794504e-06, "loss": 0.7144, "step": 19 }, { "epoch": 0.17094017094017094, "grad_norm": 1.4171269229549868, "learning_rate": 4.998397975626213e-06, "loss": 0.9539, "step": 20 }, { "epoch": 0.1794871794871795, "grad_norm": 1.2794264496219363, "learning_rate": 4.998233787465529e-06, "loss": 0.7078, "step": 21 }, { "epoch": 0.18803418803418803, "grad_norm": 1.3730284575172316, "learning_rate": 4.998061593989108e-06, "loss": 0.8275, "step": 22 }, { "epoch": 0.19658119658119658, "grad_norm": 1.4940223968438926, "learning_rate": 4.997881395748727e-06, "loss": 0.8149, "step": 23 }, { "epoch": 0.20512820512820512, "grad_norm": 1.127435221045007, "learning_rate": 4.99769319332181e-06, "loss": 0.7661, "step": 24 }, { "epoch": 0.21367521367521367, "grad_norm": 1.4374573099245262, "learning_rate": 4.997496987311431e-06, "loss": 0.8096, "step": 25 }, { "epoch": 0.2222222222222222, "grad_norm": 1.0383554614188748, "learning_rate": 4.997292778346312e-06, "loss": 0.6898, "step": 26 }, { "epoch": 0.23076923076923078, "grad_norm": 1.2454500227279588, "learning_rate": 4.9970805670808174e-06, "loss": 0.8476, "step": 27 }, { "epoch": 0.23931623931623933, "grad_norm": 1.3895431899110833, "learning_rate": 4.996860354194954e-06, "loss": 0.9476, "step": 28 }, { "epoch": 0.24786324786324787, "grad_norm": 1.2147090185053293, "learning_rate": 4.996632140394372e-06, "loss": 0.9277, "step": 29 }, { "epoch": 0.2564102564102564, "grad_norm": 1.201270924823543, "learning_rate": 4.996395926410354e-06, "loss": 0.7633, "step": 30 }, { "epoch": 0.26495726495726496, "grad_norm": 1.1116608939922576, "learning_rate": 4.996151712999826e-06, "loss": 0.8779, "step": 31 }, { "epoch": 0.27350427350427353, "grad_norm": 1.2255856149469802, "learning_rate": 4.995899500945341e-06, "loss": 0.6816, "step": 32 }, { "epoch": 0.28205128205128205, "grad_norm": 0.9559499921214146, "learning_rate": 4.995639291055084e-06, "loss": 0.7282, "step": 33 }, { "epoch": 0.2905982905982906, "grad_norm": 0.9736062146544777, "learning_rate": 4.99537108416287e-06, "loss": 0.706, "step": 34 }, { "epoch": 0.29914529914529914, "grad_norm": 1.1387327794469184, "learning_rate": 4.995094881128138e-06, "loss": 0.6738, "step": 35 }, { "epoch": 0.3076923076923077, "grad_norm": 1.248397867664662, "learning_rate": 4.994810682835951e-06, "loss": 0.7774, "step": 36 }, { "epoch": 0.3162393162393162, "grad_norm": 1.1882850864655046, "learning_rate": 4.99451849019699e-06, "loss": 0.5986, "step": 37 }, { "epoch": 0.3247863247863248, "grad_norm": 1.0587607675254118, "learning_rate": 4.994218304147556e-06, "loss": 0.7882, "step": 38 }, { "epoch": 0.3333333333333333, "grad_norm": 1.0022539150956298, "learning_rate": 4.993910125649561e-06, "loss": 0.8321, "step": 39 }, { "epoch": 0.3418803418803419, "grad_norm": 1.1127228219625331, "learning_rate": 4.993593955690529e-06, "loss": 0.6737, "step": 40 }, { "epoch": 0.3504273504273504, "grad_norm": 1.114475532217064, "learning_rate": 4.9932697952835925e-06, "loss": 0.7492, "step": 41 }, { "epoch": 0.358974358974359, "grad_norm": 1.04367701725325, "learning_rate": 4.992937645467489e-06, "loss": 0.8464, "step": 42 }, { "epoch": 0.36752136752136755, "grad_norm": 1.0727745473359473, "learning_rate": 4.992597507306552e-06, "loss": 1.0087, "step": 43 }, { "epoch": 0.37606837606837606, "grad_norm": 0.9660036788448103, "learning_rate": 4.992249381890722e-06, "loss": 0.7119, "step": 44 }, { "epoch": 0.38461538461538464, "grad_norm": 1.2125533839718292, "learning_rate": 4.991893270335526e-06, "loss": 1.0206, "step": 45 }, { "epoch": 0.39316239316239315, "grad_norm": 1.1273464632844163, "learning_rate": 4.9915291737820836e-06, "loss": 0.6818, "step": 46 }, { "epoch": 0.4017094017094017, "grad_norm": 1.0554418415896571, "learning_rate": 4.991157093397104e-06, "loss": 0.8175, "step": 47 }, { "epoch": 0.41025641025641024, "grad_norm": 1.0700579311552258, "learning_rate": 4.990777030372877e-06, "loss": 0.6876, "step": 48 }, { "epoch": 0.4188034188034188, "grad_norm": 1.0637403108101426, "learning_rate": 4.990388985927273e-06, "loss": 0.8423, "step": 49 }, { "epoch": 0.42735042735042733, "grad_norm": 0.9688437819111178, "learning_rate": 4.989992961303738e-06, "loss": 0.7794, "step": 50 }, { "epoch": 0.4358974358974359, "grad_norm": 1.0114236396805474, "learning_rate": 4.989588957771289e-06, "loss": 0.6794, "step": 51 }, { "epoch": 0.4444444444444444, "grad_norm": 1.0131329274100702, "learning_rate": 4.989176976624511e-06, "loss": 0.5956, "step": 52 }, { "epoch": 0.452991452991453, "grad_norm": 1.0333055990414326, "learning_rate": 4.988757019183553e-06, "loss": 0.6177, "step": 53 }, { "epoch": 0.46153846153846156, "grad_norm": 1.148102486098422, "learning_rate": 4.988329086794122e-06, "loss": 0.7744, "step": 54 }, { "epoch": 0.4700854700854701, "grad_norm": 1.3052841083061404, "learning_rate": 4.9878931808274796e-06, "loss": 0.644, "step": 55 }, { "epoch": 0.47863247863247865, "grad_norm": 1.0116165469923584, "learning_rate": 4.98744930268044e-06, "loss": 0.6745, "step": 56 }, { "epoch": 0.48717948717948717, "grad_norm": 1.086387211887458, "learning_rate": 4.986997453775361e-06, "loss": 0.736, "step": 57 }, { "epoch": 0.49572649572649574, "grad_norm": 0.9988882124617762, "learning_rate": 4.986537635560144e-06, "loss": 0.6586, "step": 58 }, { "epoch": 0.5042735042735043, "grad_norm": 1.0656418155038476, "learning_rate": 4.986069849508223e-06, "loss": 0.7999, "step": 59 }, { "epoch": 0.5128205128205128, "grad_norm": 1.1201336257983052, "learning_rate": 4.9855940971185705e-06, "loss": 0.7822, "step": 60 }, { "epoch": 0.5213675213675214, "grad_norm": 1.1598381779546625, "learning_rate": 4.985110379915681e-06, "loss": 0.764, "step": 61 }, { "epoch": 0.5299145299145299, "grad_norm": 1.0921954470681912, "learning_rate": 4.984618699449573e-06, "loss": 0.6829, "step": 62 }, { "epoch": 0.5384615384615384, "grad_norm": 0.9783029683124532, "learning_rate": 4.984119057295783e-06, "loss": 0.6498, "step": 63 }, { "epoch": 0.5470085470085471, "grad_norm": 1.076901000673284, "learning_rate": 4.983611455055359e-06, "loss": 0.667, "step": 64 }, { "epoch": 0.5555555555555556, "grad_norm": 1.2457502019896778, "learning_rate": 4.983095894354858e-06, "loss": 0.8111, "step": 65 }, { "epoch": 0.5641025641025641, "grad_norm": 1.0896425595072872, "learning_rate": 4.982572376846336e-06, "loss": 0.6329, "step": 66 }, { "epoch": 0.5726495726495726, "grad_norm": 0.9232081104327517, "learning_rate": 4.982040904207348e-06, "loss": 0.8773, "step": 67 }, { "epoch": 0.5811965811965812, "grad_norm": 0.9422862400984294, "learning_rate": 4.98150147814094e-06, "loss": 0.8834, "step": 68 }, { "epoch": 0.5897435897435898, "grad_norm": 1.1595435361807889, "learning_rate": 4.980954100375642e-06, "loss": 0.6244, "step": 69 }, { "epoch": 0.5982905982905983, "grad_norm": 1.146463819945862, "learning_rate": 4.980398772665468e-06, "loss": 0.6958, "step": 70 }, { "epoch": 0.6068376068376068, "grad_norm": 1.1240734279946885, "learning_rate": 4.979835496789904e-06, "loss": 0.7953, "step": 71 }, { "epoch": 0.6153846153846154, "grad_norm": 1.1655040421601977, "learning_rate": 4.979264274553906e-06, "loss": 0.9859, "step": 72 }, { "epoch": 0.6239316239316239, "grad_norm": 0.9972089985683658, "learning_rate": 4.97868510778789e-06, "loss": 0.7303, "step": 73 }, { "epoch": 0.6324786324786325, "grad_norm": 1.1284011355004735, "learning_rate": 4.978097998347737e-06, "loss": 0.861, "step": 74 }, { "epoch": 0.6410256410256411, "grad_norm": 1.050649795863996, "learning_rate": 4.977502948114772e-06, "loss": 0.7585, "step": 75 }, { "epoch": 0.6495726495726496, "grad_norm": 1.2055242119481449, "learning_rate": 4.9768999589957675e-06, "loss": 0.7491, "step": 76 }, { "epoch": 0.6581196581196581, "grad_norm": 1.045945265764331, "learning_rate": 4.976289032922937e-06, "loss": 0.8598, "step": 77 }, { "epoch": 0.6666666666666666, "grad_norm": 1.0154676826015707, "learning_rate": 4.975670171853926e-06, "loss": 0.9274, "step": 78 }, { "epoch": 0.6752136752136753, "grad_norm": 1.0849056329321192, "learning_rate": 4.975043377771806e-06, "loss": 0.6739, "step": 79 }, { "epoch": 0.6837606837606838, "grad_norm": 0.918648215949127, "learning_rate": 4.9744086526850724e-06, "loss": 0.723, "step": 80 }, { "epoch": 0.6923076923076923, "grad_norm": 1.269473123433632, "learning_rate": 4.973765998627628e-06, "loss": 0.9916, "step": 81 }, { "epoch": 0.7008547008547008, "grad_norm": 1.0609779535228994, "learning_rate": 4.97311541765879e-06, "loss": 0.8092, "step": 82 }, { "epoch": 0.7094017094017094, "grad_norm": 1.1075178803628833, "learning_rate": 4.972456911863273e-06, "loss": 0.8491, "step": 83 }, { "epoch": 0.717948717948718, "grad_norm": 1.1177775363091615, "learning_rate": 4.971790483351186e-06, "loss": 0.7181, "step": 84 }, { "epoch": 0.7264957264957265, "grad_norm": 1.19435718577341, "learning_rate": 4.971116134258026e-06, "loss": 0.7371, "step": 85 }, { "epoch": 0.7350427350427351, "grad_norm": 0.9344635761516649, "learning_rate": 4.97043386674467e-06, "loss": 0.8082, "step": 86 }, { "epoch": 0.7435897435897436, "grad_norm": 1.1047970864696917, "learning_rate": 4.969743682997372e-06, "loss": 0.6576, "step": 87 }, { "epoch": 0.7521367521367521, "grad_norm": 1.1492319889376637, "learning_rate": 4.969045585227747e-06, "loss": 0.7261, "step": 88 }, { "epoch": 0.7606837606837606, "grad_norm": 0.8540587595768252, "learning_rate": 4.968339575672773e-06, "loss": 0.7566, "step": 89 }, { "epoch": 0.7692307692307693, "grad_norm": 1.435559330737617, "learning_rate": 4.967625656594782e-06, "loss": 0.8515, "step": 90 }, { "epoch": 0.7777777777777778, "grad_norm": 1.0885688071755686, "learning_rate": 4.966903830281449e-06, "loss": 0.7278, "step": 91 }, { "epoch": 0.7863247863247863, "grad_norm": 1.0793773087579055, "learning_rate": 4.966174099045784e-06, "loss": 0.7646, "step": 92 }, { "epoch": 0.7948717948717948, "grad_norm": 1.192286892157816, "learning_rate": 4.9654364652261345e-06, "loss": 0.8218, "step": 93 }, { "epoch": 0.8034188034188035, "grad_norm": 0.9421147050497265, "learning_rate": 4.964690931186165e-06, "loss": 0.7428, "step": 94 }, { "epoch": 0.811965811965812, "grad_norm": 1.0300568563597792, "learning_rate": 4.963937499314857e-06, "loss": 0.9733, "step": 95 }, { "epoch": 0.8205128205128205, "grad_norm": 1.1507829257449622, "learning_rate": 4.963176172026501e-06, "loss": 0.8656, "step": 96 }, { "epoch": 0.8290598290598291, "grad_norm": 1.0732730564082225, "learning_rate": 4.962406951760687e-06, "loss": 0.6834, "step": 97 }, { "epoch": 0.8376068376068376, "grad_norm": 1.272187919122158, "learning_rate": 4.961629840982296e-06, "loss": 0.5806, "step": 98 }, { "epoch": 0.8461538461538461, "grad_norm": 1.0209482085094501, "learning_rate": 4.9608448421814944e-06, "loss": 0.593, "step": 99 }, { "epoch": 0.8547008547008547, "grad_norm": 1.070116471178533, "learning_rate": 4.960051957873726e-06, "loss": 0.7731, "step": 100 }, { "epoch": 0.8632478632478633, "grad_norm": 1.1286783767982944, "learning_rate": 4.959251190599699e-06, "loss": 0.9449, "step": 101 }, { "epoch": 0.8717948717948718, "grad_norm": 0.9613784562622458, "learning_rate": 4.958442542925385e-06, "loss": 0.5717, "step": 102 }, { "epoch": 0.8803418803418803, "grad_norm": 1.0834987740649913, "learning_rate": 4.9576260174420085e-06, "loss": 0.6655, "step": 103 }, { "epoch": 0.8888888888888888, "grad_norm": 1.1508010697308189, "learning_rate": 4.956801616766033e-06, "loss": 0.7036, "step": 104 }, { "epoch": 0.8974358974358975, "grad_norm": 0.857235837591077, "learning_rate": 4.955969343539162e-06, "loss": 0.7836, "step": 105 }, { "epoch": 0.905982905982906, "grad_norm": 0.9861436603055789, "learning_rate": 4.955129200428323e-06, "loss": 0.6834, "step": 106 }, { "epoch": 0.9145299145299145, "grad_norm": 1.0827086944967481, "learning_rate": 4.9542811901256615e-06, "loss": 0.7214, "step": 107 }, { "epoch": 0.9230769230769231, "grad_norm": 1.0999297259841632, "learning_rate": 4.953425315348534e-06, "loss": 0.6893, "step": 108 }, { "epoch": 0.9316239316239316, "grad_norm": 1.22960169622494, "learning_rate": 4.952561578839498e-06, "loss": 0.7697, "step": 109 }, { "epoch": 0.9401709401709402, "grad_norm": 1.0578878999566572, "learning_rate": 4.9516899833663e-06, "loss": 0.7045, "step": 110 }, { "epoch": 0.9487179487179487, "grad_norm": 1.1846780679296018, "learning_rate": 4.950810531721874e-06, "loss": 0.7342, "step": 111 }, { "epoch": 0.9572649572649573, "grad_norm": 1.1912506798196463, "learning_rate": 4.949923226724325e-06, "loss": 0.8233, "step": 112 }, { "epoch": 0.9658119658119658, "grad_norm": 1.1252852469907122, "learning_rate": 4.949028071216926e-06, "loss": 0.624, "step": 113 }, { "epoch": 0.9743589743589743, "grad_norm": 1.1517000451843173, "learning_rate": 4.948125068068102e-06, "loss": 0.544, "step": 114 }, { "epoch": 0.9829059829059829, "grad_norm": 1.060592964482793, "learning_rate": 4.94721422017143e-06, "loss": 0.959, "step": 115 }, { "epoch": 0.9914529914529915, "grad_norm": 1.0042046551638841, "learning_rate": 4.946295530445621e-06, "loss": 0.7238, "step": 116 }, { "epoch": 1.0, "grad_norm": 1.0001249481240222, "learning_rate": 4.9453690018345144e-06, "loss": 0.619, "step": 117 }, { "epoch": 1.0085470085470085, "grad_norm": 0.9380046535719442, "learning_rate": 4.94443463730707e-06, "loss": 0.6444, "step": 118 }, { "epoch": 1.017094017094017, "grad_norm": 1.0293011172402107, "learning_rate": 4.943492439857357e-06, "loss": 0.7716, "step": 119 }, { "epoch": 1.0256410256410255, "grad_norm": 0.9855467678686244, "learning_rate": 4.942542412504543e-06, "loss": 0.7719, "step": 120 }, { "epoch": 1.0341880341880343, "grad_norm": 1.1807086649027472, "learning_rate": 4.9415845582928866e-06, "loss": 0.6453, "step": 121 }, { "epoch": 1.0427350427350428, "grad_norm": 1.1780514473878632, "learning_rate": 4.940618880291725e-06, "loss": 0.9361, "step": 122 }, { "epoch": 1.0512820512820513, "grad_norm": 1.0138477857952197, "learning_rate": 4.9396453815954695e-06, "loss": 0.5969, "step": 123 }, { "epoch": 1.0598290598290598, "grad_norm": 0.9411679288409218, "learning_rate": 4.938664065323588e-06, "loss": 0.6425, "step": 124 }, { "epoch": 1.0683760683760684, "grad_norm": 1.0955596194799122, "learning_rate": 4.937674934620601e-06, "loss": 0.7439, "step": 125 }, { "epoch": 1.0769230769230769, "grad_norm": 1.2132702603532166, "learning_rate": 4.9366779926560705e-06, "loss": 0.7427, "step": 126 }, { "epoch": 1.0854700854700854, "grad_norm": 0.8719242281374018, "learning_rate": 4.935673242624585e-06, "loss": 0.5546, "step": 127 }, { "epoch": 1.0940170940170941, "grad_norm": 0.913689131986586, "learning_rate": 4.934660687745758e-06, "loss": 0.6433, "step": 128 }, { "epoch": 1.1025641025641026, "grad_norm": 1.0562719427443, "learning_rate": 4.93364033126421e-06, "loss": 0.6447, "step": 129 }, { "epoch": 1.1111111111111112, "grad_norm": 1.1406761768492368, "learning_rate": 4.93261217644956e-06, "loss": 0.7865, "step": 130 }, { "epoch": 1.1196581196581197, "grad_norm": 1.0287132018048986, "learning_rate": 4.931576226596418e-06, "loss": 0.8902, "step": 131 }, { "epoch": 1.1282051282051282, "grad_norm": 1.0679366877890835, "learning_rate": 4.930532485024372e-06, "loss": 0.5658, "step": 132 }, { "epoch": 1.1367521367521367, "grad_norm": 1.1236922170795136, "learning_rate": 4.929480955077976e-06, "loss": 0.5752, "step": 133 }, { "epoch": 1.1452991452991452, "grad_norm": 0.8853014304117948, "learning_rate": 4.928421640126742e-06, "loss": 0.5763, "step": 134 }, { "epoch": 1.1538461538461537, "grad_norm": 0.9775295301218139, "learning_rate": 4.927354543565131e-06, "loss": 0.6277, "step": 135 }, { "epoch": 1.1623931623931625, "grad_norm": 0.9579962790913475, "learning_rate": 4.926279668812533e-06, "loss": 0.8088, "step": 136 }, { "epoch": 1.170940170940171, "grad_norm": 1.1435685522698529, "learning_rate": 4.925197019313269e-06, "loss": 0.697, "step": 137 }, { "epoch": 1.1794871794871795, "grad_norm": 0.870824019840728, "learning_rate": 4.9241065985365695e-06, "loss": 0.6292, "step": 138 }, { "epoch": 1.188034188034188, "grad_norm": 1.0983119201797829, "learning_rate": 4.923008409976568e-06, "loss": 0.7311, "step": 139 }, { "epoch": 1.1965811965811965, "grad_norm": 1.0305388966879632, "learning_rate": 4.921902457152289e-06, "loss": 0.7046, "step": 140 }, { "epoch": 1.205128205128205, "grad_norm": 1.119605051218424, "learning_rate": 4.920788743607636e-06, "loss": 0.843, "step": 141 }, { "epoch": 1.2136752136752136, "grad_norm": 0.915923341125377, "learning_rate": 4.919667272911383e-06, "loss": 0.6354, "step": 142 }, { "epoch": 1.2222222222222223, "grad_norm": 0.9108503566123858, "learning_rate": 4.91853804865716e-06, "loss": 0.7452, "step": 143 }, { "epoch": 1.2307692307692308, "grad_norm": 0.9004822052884224, "learning_rate": 4.917401074463441e-06, "loss": 0.7132, "step": 144 }, { "epoch": 1.2393162393162394, "grad_norm": 1.0063673480230306, "learning_rate": 4.916256353973535e-06, "loss": 0.6438, "step": 145 }, { "epoch": 1.2478632478632479, "grad_norm": 0.9421746838568855, "learning_rate": 4.915103890855574e-06, "loss": 0.578, "step": 146 }, { "epoch": 1.2564102564102564, "grad_norm": 1.0676892185226952, "learning_rate": 4.913943688802497e-06, "loss": 0.6855, "step": 147 }, { "epoch": 1.264957264957265, "grad_norm": 1.1188578607492226, "learning_rate": 4.912775751532047e-06, "loss": 0.7427, "step": 148 }, { "epoch": 1.2735042735042734, "grad_norm": 0.9469435476035265, "learning_rate": 4.91160008278675e-06, "loss": 0.5891, "step": 149 }, { "epoch": 1.282051282051282, "grad_norm": 1.0167095611518875, "learning_rate": 4.9104166863339065e-06, "loss": 0.7139, "step": 150 }, { "epoch": 1.2905982905982907, "grad_norm": 1.0458260632626932, "learning_rate": 4.90922556596558e-06, "loss": 0.874, "step": 151 }, { "epoch": 1.2991452991452992, "grad_norm": 1.0453297709360865, "learning_rate": 4.908026725498586e-06, "loss": 0.5972, "step": 152 }, { "epoch": 1.3076923076923077, "grad_norm": 1.0265238458385364, "learning_rate": 4.9068201687744774e-06, "loss": 0.6141, "step": 153 }, { "epoch": 1.3162393162393162, "grad_norm": 0.9359503022694231, "learning_rate": 4.905605899659532e-06, "loss": 0.5768, "step": 154 }, { "epoch": 1.3247863247863247, "grad_norm": 1.1684603854006186, "learning_rate": 4.90438392204474e-06, "loss": 0.7656, "step": 155 }, { "epoch": 1.3333333333333333, "grad_norm": 1.0501902958929783, "learning_rate": 4.903154239845798e-06, "loss": 0.7487, "step": 156 }, { "epoch": 1.341880341880342, "grad_norm": 1.1842195482922369, "learning_rate": 4.901916857003084e-06, "loss": 0.8917, "step": 157 }, { "epoch": 1.3504273504273505, "grad_norm": 0.9216838395103608, "learning_rate": 4.9006717774816585e-06, "loss": 0.9301, "step": 158 }, { "epoch": 1.358974358974359, "grad_norm": 1.1773273648490519, "learning_rate": 4.8994190052712406e-06, "loss": 0.4604, "step": 159 }, { "epoch": 1.3675213675213675, "grad_norm": 0.9974738324004961, "learning_rate": 4.898158544386201e-06, "loss": 0.8768, "step": 160 }, { "epoch": 1.376068376068376, "grad_norm": 0.9555997535951641, "learning_rate": 4.896890398865548e-06, "loss": 0.5342, "step": 161 }, { "epoch": 1.3846153846153846, "grad_norm": 1.0211237936640918, "learning_rate": 4.895614572772916e-06, "loss": 0.5775, "step": 162 }, { "epoch": 1.393162393162393, "grad_norm": 1.0818255776569934, "learning_rate": 4.894331070196548e-06, "loss": 0.7447, "step": 163 }, { "epoch": 1.4017094017094016, "grad_norm": 0.9298860880587865, "learning_rate": 4.893039895249288e-06, "loss": 0.5779, "step": 164 }, { "epoch": 1.4102564102564101, "grad_norm": 1.0062273244598126, "learning_rate": 4.8917410520685635e-06, "loss": 0.6862, "step": 165 }, { "epoch": 1.4188034188034189, "grad_norm": 1.0998490770749472, "learning_rate": 4.890434544816375e-06, "loss": 0.6407, "step": 166 }, { "epoch": 1.4273504273504274, "grad_norm": 1.0565419203304869, "learning_rate": 4.889120377679282e-06, "loss": 0.6379, "step": 167 }, { "epoch": 1.435897435897436, "grad_norm": 1.0792399462153217, "learning_rate": 4.887798554868388e-06, "loss": 0.6028, "step": 168 }, { "epoch": 1.4444444444444444, "grad_norm": 1.0475389343787085, "learning_rate": 4.88646908061933e-06, "loss": 0.586, "step": 169 }, { "epoch": 1.452991452991453, "grad_norm": 1.0323292401835535, "learning_rate": 4.885131959192262e-06, "loss": 0.6262, "step": 170 }, { "epoch": 1.4615384615384617, "grad_norm": 1.1577807260074058, "learning_rate": 4.883787194871841e-06, "loss": 0.6364, "step": 171 }, { "epoch": 1.4700854700854702, "grad_norm": 0.8278225378608461, "learning_rate": 4.882434791967219e-06, "loss": 0.5483, "step": 172 }, { "epoch": 1.4786324786324787, "grad_norm": 1.1193607717443066, "learning_rate": 4.881074754812021e-06, "loss": 0.5526, "step": 173 }, { "epoch": 1.4871794871794872, "grad_norm": 1.1417530862216305, "learning_rate": 4.879707087764336e-06, "loss": 0.6403, "step": 174 }, { "epoch": 1.4957264957264957, "grad_norm": 0.9659694011039861, "learning_rate": 4.878331795206705e-06, "loss": 0.7537, "step": 175 }, { "epoch": 1.5042735042735043, "grad_norm": 1.0693878337410088, "learning_rate": 4.876948881546101e-06, "loss": 0.6976, "step": 176 }, { "epoch": 1.5128205128205128, "grad_norm": 1.0498589499731368, "learning_rate": 4.875558351213918e-06, "loss": 0.7038, "step": 177 }, { "epoch": 1.5213675213675213, "grad_norm": 1.1001761941159536, "learning_rate": 4.874160208665958e-06, "loss": 0.4691, "step": 178 }, { "epoch": 1.5299145299145298, "grad_norm": 1.2397955357433883, "learning_rate": 4.872754458382416e-06, "loss": 0.6688, "step": 179 }, { "epoch": 1.5384615384615383, "grad_norm": 0.9672667050888256, "learning_rate": 4.8713411048678635e-06, "loss": 0.8776, "step": 180 }, { "epoch": 1.547008547008547, "grad_norm": 1.051939715180058, "learning_rate": 4.869920152651239e-06, "loss": 0.8041, "step": 181 }, { "epoch": 1.5555555555555556, "grad_norm": 1.3625309523663536, "learning_rate": 4.868491606285823e-06, "loss": 0.8096, "step": 182 }, { "epoch": 1.564102564102564, "grad_norm": 1.1734670133598417, "learning_rate": 4.86705547034924e-06, "loss": 0.6965, "step": 183 }, { "epoch": 1.5726495726495726, "grad_norm": 1.0740816983559884, "learning_rate": 4.865611749443428e-06, "loss": 0.6973, "step": 184 }, { "epoch": 1.5811965811965814, "grad_norm": 0.8760799984387524, "learning_rate": 4.864160448194632e-06, "loss": 0.5856, "step": 185 }, { "epoch": 1.5897435897435899, "grad_norm": 1.184991710617603, "learning_rate": 4.862701571253387e-06, "loss": 0.5975, "step": 186 }, { "epoch": 1.5982905982905984, "grad_norm": 0.9980320434419719, "learning_rate": 4.861235123294505e-06, "loss": 0.595, "step": 187 }, { "epoch": 1.606837606837607, "grad_norm": 1.2814407369839131, "learning_rate": 4.859761109017056e-06, "loss": 0.5688, "step": 188 }, { "epoch": 1.6153846153846154, "grad_norm": 1.0094832294279448, "learning_rate": 4.858279533144358e-06, "loss": 0.671, "step": 189 }, { "epoch": 1.623931623931624, "grad_norm": 1.1444371260857773, "learning_rate": 4.856790400423958e-06, "loss": 0.6731, "step": 190 }, { "epoch": 1.6324786324786325, "grad_norm": 1.1115974299621325, "learning_rate": 4.8552937156276185e-06, "loss": 0.5646, "step": 191 }, { "epoch": 1.641025641025641, "grad_norm": 1.03171495160948, "learning_rate": 4.8537894835513e-06, "loss": 0.973, "step": 192 }, { "epoch": 1.6495726495726495, "grad_norm": 0.8294802712494503, "learning_rate": 4.8522777090151505e-06, "loss": 0.5523, "step": 193 }, { "epoch": 1.658119658119658, "grad_norm": 0.9531056635949867, "learning_rate": 4.8507583968634845e-06, "loss": 0.6492, "step": 194 }, { "epoch": 1.6666666666666665, "grad_norm": 1.0522126505451395, "learning_rate": 4.849231551964771e-06, "loss": 0.6263, "step": 195 }, { "epoch": 1.6752136752136753, "grad_norm": 1.2954172102399524, "learning_rate": 4.847697179211618e-06, "loss": 0.5302, "step": 196 }, { "epoch": 1.6837606837606838, "grad_norm": 1.1470853197940476, "learning_rate": 4.8461552835207524e-06, "loss": 0.6199, "step": 197 }, { "epoch": 1.6923076923076923, "grad_norm": 0.8217311790990888, "learning_rate": 4.844605869833011e-06, "loss": 0.5702, "step": 198 }, { "epoch": 1.7008547008547008, "grad_norm": 1.0740164329626947, "learning_rate": 4.84304894311332e-06, "loss": 0.8951, "step": 199 }, { "epoch": 1.7094017094017095, "grad_norm": 1.0571598154246917, "learning_rate": 4.841484508350679e-06, "loss": 0.5311, "step": 200 }, { "epoch": 1.717948717948718, "grad_norm": 1.0253573731407604, "learning_rate": 4.839912570558148e-06, "loss": 0.5615, "step": 201 }, { "epoch": 1.7264957264957266, "grad_norm": 1.1096555314448195, "learning_rate": 4.838333134772828e-06, "loss": 0.5405, "step": 202 }, { "epoch": 1.735042735042735, "grad_norm": 1.1490180420824463, "learning_rate": 4.836746206055849e-06, "loss": 0.5965, "step": 203 }, { "epoch": 1.7435897435897436, "grad_norm": 1.248513432955595, "learning_rate": 4.835151789492348e-06, "loss": 0.6728, "step": 204 }, { "epoch": 1.7521367521367521, "grad_norm": 1.0730022039252907, "learning_rate": 4.83354989019146e-06, "loss": 0.5746, "step": 205 }, { "epoch": 1.7606837606837606, "grad_norm": 1.1292781253242146, "learning_rate": 4.831940513286293e-06, "loss": 0.7255, "step": 206 }, { "epoch": 1.7692307692307692, "grad_norm": 1.2114916668909483, "learning_rate": 4.83032366393392e-06, "loss": 0.6832, "step": 207 }, { "epoch": 1.7777777777777777, "grad_norm": 1.0038198857460197, "learning_rate": 4.828699347315357e-06, "loss": 0.763, "step": 208 }, { "epoch": 1.7863247863247862, "grad_norm": 1.2044564952881287, "learning_rate": 4.827067568635546e-06, "loss": 0.5676, "step": 209 }, { "epoch": 1.7948717948717947, "grad_norm": 1.1783964857396678, "learning_rate": 4.825428333123346e-06, "loss": 0.749, "step": 210 }, { "epoch": 1.8034188034188035, "grad_norm": 1.2129952692965869, "learning_rate": 4.823781646031505e-06, "loss": 0.744, "step": 211 }, { "epoch": 1.811965811965812, "grad_norm": 1.1997039791188244, "learning_rate": 4.822127512636652e-06, "loss": 0.4263, "step": 212 }, { "epoch": 1.8205128205128205, "grad_norm": 1.2801580897822806, "learning_rate": 4.820465938239274e-06, "loss": 0.7157, "step": 213 }, { "epoch": 1.8290598290598292, "grad_norm": 1.2201502360145773, "learning_rate": 4.8187969281637054e-06, "loss": 0.6941, "step": 214 }, { "epoch": 1.8376068376068377, "grad_norm": 1.0696558627600015, "learning_rate": 4.817120487758105e-06, "loss": 0.6869, "step": 215 }, { "epoch": 1.8461538461538463, "grad_norm": 0.9738340449626065, "learning_rate": 4.815436622394442e-06, "loss": 0.5687, "step": 216 }, { "epoch": 1.8547008547008548, "grad_norm": 1.0884107651937243, "learning_rate": 4.813745337468478e-06, "loss": 0.8293, "step": 217 }, { "epoch": 1.8632478632478633, "grad_norm": 1.0301313414697264, "learning_rate": 4.8120466383997486e-06, "loss": 0.631, "step": 218 }, { "epoch": 1.8717948717948718, "grad_norm": 1.162851200863082, "learning_rate": 4.81034053063155e-06, "loss": 0.6991, "step": 219 }, { "epoch": 1.8803418803418803, "grad_norm": 1.0191227737123674, "learning_rate": 4.8086270196309174e-06, "loss": 0.869, "step": 220 }, { "epoch": 1.8888888888888888, "grad_norm": 1.145852959901522, "learning_rate": 4.806906110888606e-06, "loss": 0.6711, "step": 221 }, { "epoch": 1.8974358974358974, "grad_norm": 1.0405069802093607, "learning_rate": 4.805177809919081e-06, "loss": 0.7719, "step": 222 }, { "epoch": 1.9059829059829059, "grad_norm": 1.187095805002808, "learning_rate": 4.803442122260494e-06, "loss": 0.7155, "step": 223 }, { "epoch": 1.9145299145299144, "grad_norm": 1.1479226660380026, "learning_rate": 4.801699053474663e-06, "loss": 0.5877, "step": 224 }, { "epoch": 1.9230769230769231, "grad_norm": 1.0558651526719405, "learning_rate": 4.799948609147061e-06, "loss": 0.5684, "step": 225 }, { "epoch": 1.9316239316239316, "grad_norm": 1.0038732472086436, "learning_rate": 4.798190794886795e-06, "loss": 0.7175, "step": 226 }, { "epoch": 1.9401709401709402, "grad_norm": 0.9248964570130919, "learning_rate": 4.796425616326588e-06, "loss": 0.5006, "step": 227 }, { "epoch": 1.9487179487179487, "grad_norm": 1.2300999080227526, "learning_rate": 4.79465307912276e-06, "loss": 0.6256, "step": 228 }, { "epoch": 1.9572649572649574, "grad_norm": 1.0417084674820174, "learning_rate": 4.792873188955213e-06, "loss": 0.8554, "step": 229 }, { "epoch": 1.965811965811966, "grad_norm": 1.2349753774243406, "learning_rate": 4.791085951527408e-06, "loss": 0.5971, "step": 230 }, { "epoch": 1.9743589743589745, "grad_norm": 0.9778954080850863, "learning_rate": 4.789291372566352e-06, "loss": 0.696, "step": 231 }, { "epoch": 1.982905982905983, "grad_norm": 1.0172802549418838, "learning_rate": 4.787489457822576e-06, "loss": 0.6935, "step": 232 }, { "epoch": 1.9914529914529915, "grad_norm": 1.1202625109201434, "learning_rate": 4.785680213070117e-06, "loss": 0.9207, "step": 233 }, { "epoch": 2.0, "grad_norm": 1.0820721665308195, "learning_rate": 4.783863644106502e-06, "loss": 0.6833, "step": 234 }, { "epoch": 2.0085470085470085, "grad_norm": 1.1358440696728398, "learning_rate": 4.782039756752728e-06, "loss": 0.618, "step": 235 }, { "epoch": 2.017094017094017, "grad_norm": 1.0457159806264371, "learning_rate": 4.780208556853239e-06, "loss": 0.4722, "step": 236 }, { "epoch": 2.0256410256410255, "grad_norm": 1.029809279617813, "learning_rate": 4.7783700502759145e-06, "loss": 0.656, "step": 237 }, { "epoch": 2.034188034188034, "grad_norm": 1.103878378087795, "learning_rate": 4.776524242912047e-06, "loss": 0.7855, "step": 238 }, { "epoch": 2.0427350427350426, "grad_norm": 1.0267266691810852, "learning_rate": 4.774671140676325e-06, "loss": 0.6245, "step": 239 }, { "epoch": 2.051282051282051, "grad_norm": 1.1132991520998483, "learning_rate": 4.77281074950681e-06, "loss": 0.9077, "step": 240 }, { "epoch": 2.0598290598290596, "grad_norm": 1.0667495151776363, "learning_rate": 4.77094307536492e-06, "loss": 0.5684, "step": 241 }, { "epoch": 2.0683760683760686, "grad_norm": 1.0054382245506719, "learning_rate": 4.769068124235413e-06, "loss": 0.4944, "step": 242 }, { "epoch": 2.076923076923077, "grad_norm": 0.9198876553952378, "learning_rate": 4.7671859021263635e-06, "loss": 0.7061, "step": 243 }, { "epoch": 2.0854700854700856, "grad_norm": 1.0023794715887067, "learning_rate": 4.765296415069146e-06, "loss": 0.6205, "step": 244 }, { "epoch": 2.094017094017094, "grad_norm": 1.0373648195951122, "learning_rate": 4.763399669118414e-06, "loss": 0.794, "step": 245 }, { "epoch": 2.1025641025641026, "grad_norm": 0.9929772876721678, "learning_rate": 4.761495670352081e-06, "loss": 0.5498, "step": 246 }, { "epoch": 2.111111111111111, "grad_norm": 1.0991873012161186, "learning_rate": 4.759584424871302e-06, "loss": 0.6007, "step": 247 }, { "epoch": 2.1196581196581197, "grad_norm": 0.9826836883278143, "learning_rate": 4.757665938800453e-06, "loss": 0.7296, "step": 248 }, { "epoch": 2.128205128205128, "grad_norm": 1.0917471368807863, "learning_rate": 4.755740218287113e-06, "loss": 0.3556, "step": 249 }, { "epoch": 2.1367521367521367, "grad_norm": 1.0825910621465624, "learning_rate": 4.753807269502041e-06, "loss": 0.664, "step": 250 }, { "epoch": 2.1452991452991452, "grad_norm": 0.8905203613513742, "learning_rate": 4.7518670986391576e-06, "loss": 0.4741, "step": 251 }, { "epoch": 2.1538461538461537, "grad_norm": 0.9127232740781913, "learning_rate": 4.749919711915531e-06, "loss": 0.6486, "step": 252 }, { "epoch": 2.1623931623931623, "grad_norm": 1.2654836138848309, "learning_rate": 4.747965115571345e-06, "loss": 0.6307, "step": 253 }, { "epoch": 2.1709401709401708, "grad_norm": 1.049141772610685, "learning_rate": 4.746003315869889e-06, "loss": 0.6238, "step": 254 }, { "epoch": 2.1794871794871793, "grad_norm": 1.2805011771546446, "learning_rate": 4.744034319097536e-06, "loss": 0.6115, "step": 255 }, { "epoch": 2.1880341880341883, "grad_norm": 1.0625960491407178, "learning_rate": 4.742058131563718e-06, "loss": 0.6182, "step": 256 }, { "epoch": 2.1965811965811968, "grad_norm": 1.1293658867526906, "learning_rate": 4.7400747596009125e-06, "loss": 0.5825, "step": 257 }, { "epoch": 2.2051282051282053, "grad_norm": 0.9590474269446825, "learning_rate": 4.738084209564617e-06, "loss": 0.5145, "step": 258 }, { "epoch": 2.213675213675214, "grad_norm": 0.8835052747120326, "learning_rate": 4.73608648783333e-06, "loss": 0.5783, "step": 259 }, { "epoch": 2.2222222222222223, "grad_norm": 1.0657600277339365, "learning_rate": 4.734081600808531e-06, "loss": 0.7102, "step": 260 }, { "epoch": 2.230769230769231, "grad_norm": 1.1192032409276582, "learning_rate": 4.73206955491466e-06, "loss": 0.5007, "step": 261 }, { "epoch": 2.2393162393162394, "grad_norm": 1.056716369944498, "learning_rate": 4.7300503565990985e-06, "loss": 0.5169, "step": 262 }, { "epoch": 2.247863247863248, "grad_norm": 0.9269001645142524, "learning_rate": 4.728024012332145e-06, "loss": 0.4769, "step": 263 }, { "epoch": 2.2564102564102564, "grad_norm": 1.0440329703988538, "learning_rate": 4.725990528606996e-06, "loss": 0.4783, "step": 264 }, { "epoch": 2.264957264957265, "grad_norm": 1.0105785765055395, "learning_rate": 4.723949911939728e-06, "loss": 0.618, "step": 265 }, { "epoch": 2.2735042735042734, "grad_norm": 1.1776085377093957, "learning_rate": 4.7219021688692725e-06, "loss": 0.6379, "step": 266 }, { "epoch": 2.282051282051282, "grad_norm": 1.2314287795636818, "learning_rate": 4.719847305957398e-06, "loss": 0.6684, "step": 267 }, { "epoch": 2.2905982905982905, "grad_norm": 1.1004054117724542, "learning_rate": 4.717785329788685e-06, "loss": 0.7569, "step": 268 }, { "epoch": 2.299145299145299, "grad_norm": 1.08262338980394, "learning_rate": 4.715716246970511e-06, "loss": 0.7849, "step": 269 }, { "epoch": 2.3076923076923075, "grad_norm": 0.9736885767195094, "learning_rate": 4.7136400641330245e-06, "loss": 0.6033, "step": 270 }, { "epoch": 2.316239316239316, "grad_norm": 0.9932485252912346, "learning_rate": 4.7115567879291265e-06, "loss": 0.5402, "step": 271 }, { "epoch": 2.324786324786325, "grad_norm": 0.8913116656138574, "learning_rate": 4.709466425034445e-06, "loss": 0.6844, "step": 272 }, { "epoch": 2.3333333333333335, "grad_norm": 1.0793309402123525, "learning_rate": 4.707368982147318e-06, "loss": 0.6513, "step": 273 }, { "epoch": 2.341880341880342, "grad_norm": 0.9562775991017588, "learning_rate": 4.705264465988771e-06, "loss": 0.6892, "step": 274 }, { "epoch": 2.3504273504273505, "grad_norm": 1.1089663383001975, "learning_rate": 4.703152883302498e-06, "loss": 0.4878, "step": 275 }, { "epoch": 2.358974358974359, "grad_norm": 0.9548004271587162, "learning_rate": 4.701034240854829e-06, "loss": 0.6004, "step": 276 }, { "epoch": 2.3675213675213675, "grad_norm": 1.151717398068187, "learning_rate": 4.6989085454347236e-06, "loss": 0.6388, "step": 277 }, { "epoch": 2.376068376068376, "grad_norm": 1.110544374011295, "learning_rate": 4.696775803853739e-06, "loss": 0.6309, "step": 278 }, { "epoch": 2.3846153846153846, "grad_norm": 1.018520072977354, "learning_rate": 4.694636022946012e-06, "loss": 0.6167, "step": 279 }, { "epoch": 2.393162393162393, "grad_norm": 1.0420288996844116, "learning_rate": 4.692489209568234e-06, "loss": 0.7685, "step": 280 }, { "epoch": 2.4017094017094016, "grad_norm": 1.08086419442703, "learning_rate": 4.690335370599633e-06, "loss": 0.6983, "step": 281 }, { "epoch": 2.41025641025641, "grad_norm": 0.8754314274596378, "learning_rate": 4.68817451294195e-06, "loss": 0.5258, "step": 282 }, { "epoch": 2.4188034188034186, "grad_norm": 0.9691136068755867, "learning_rate": 4.686006643519415e-06, "loss": 0.715, "step": 283 }, { "epoch": 2.427350427350427, "grad_norm": 1.0533303155651648, "learning_rate": 4.683831769278729e-06, "loss": 0.6357, "step": 284 }, { "epoch": 2.435897435897436, "grad_norm": 1.0091984594483059, "learning_rate": 4.681649897189036e-06, "loss": 0.6379, "step": 285 }, { "epoch": 2.4444444444444446, "grad_norm": 1.0015146496481933, "learning_rate": 4.679461034241906e-06, "loss": 0.6303, "step": 286 }, { "epoch": 2.452991452991453, "grad_norm": 1.0754076483476527, "learning_rate": 4.677265187451311e-06, "loss": 0.5038, "step": 287 }, { "epoch": 2.4615384615384617, "grad_norm": 0.9176429719414579, "learning_rate": 4.675062363853599e-06, "loss": 0.6666, "step": 288 }, { "epoch": 2.47008547008547, "grad_norm": 0.9478045791249746, "learning_rate": 4.672852570507476e-06, "loss": 0.4181, "step": 289 }, { "epoch": 2.4786324786324787, "grad_norm": 0.9852262013119069, "learning_rate": 4.670635814493985e-06, "loss": 0.5489, "step": 290 }, { "epoch": 2.4871794871794872, "grad_norm": 1.0021736330504551, "learning_rate": 4.668412102916474e-06, "loss": 0.5666, "step": 291 }, { "epoch": 2.4957264957264957, "grad_norm": 1.1968080215087806, "learning_rate": 4.666181442900584e-06, "loss": 0.7247, "step": 292 }, { "epoch": 2.5042735042735043, "grad_norm": 1.40060050962396, "learning_rate": 4.663943841594219e-06, "loss": 0.876, "step": 293 }, { "epoch": 2.5128205128205128, "grad_norm": 1.083324854920149, "learning_rate": 4.6616993061675275e-06, "loss": 0.5184, "step": 294 }, { "epoch": 2.5213675213675213, "grad_norm": 1.028281360150688, "learning_rate": 4.659447843812876e-06, "loss": 0.6421, "step": 295 }, { "epoch": 2.52991452991453, "grad_norm": 1.087837641665171, "learning_rate": 4.657189461744829e-06, "loss": 0.6551, "step": 296 }, { "epoch": 2.5384615384615383, "grad_norm": 1.0872114120511283, "learning_rate": 4.654924167200124e-06, "loss": 0.6856, "step": 297 }, { "epoch": 2.547008547008547, "grad_norm": 1.1629580209566888, "learning_rate": 4.652651967437647e-06, "loss": 0.5672, "step": 298 }, { "epoch": 2.5555555555555554, "grad_norm": 1.0863391277267664, "learning_rate": 4.650372869738415e-06, "loss": 0.4294, "step": 299 }, { "epoch": 2.564102564102564, "grad_norm": 1.1561933663636217, "learning_rate": 4.648086881405542e-06, "loss": 0.6269, "step": 300 }, { "epoch": 2.5726495726495724, "grad_norm": 1.0717857311905097, "learning_rate": 4.6457940097642315e-06, "loss": 0.4044, "step": 301 }, { "epoch": 2.5811965811965814, "grad_norm": 1.1707781764907386, "learning_rate": 4.643494262161735e-06, "loss": 0.5534, "step": 302 }, { "epoch": 2.58974358974359, "grad_norm": 1.1549910291770171, "learning_rate": 4.6411876459673435e-06, "loss": 0.6494, "step": 303 }, { "epoch": 2.5982905982905984, "grad_norm": 1.1094722653709168, "learning_rate": 4.638874168572355e-06, "loss": 0.64, "step": 304 }, { "epoch": 2.606837606837607, "grad_norm": 1.207505213396902, "learning_rate": 4.636553837390051e-06, "loss": 0.8608, "step": 305 }, { "epoch": 2.6153846153846154, "grad_norm": 1.045512641873836, "learning_rate": 4.634226659855681e-06, "loss": 0.6105, "step": 306 }, { "epoch": 2.623931623931624, "grad_norm": 0.8967372988993064, "learning_rate": 4.631892643426428e-06, "loss": 0.6645, "step": 307 }, { "epoch": 2.6324786324786325, "grad_norm": 1.0987505726387903, "learning_rate": 4.629551795581393e-06, "loss": 0.5404, "step": 308 }, { "epoch": 2.641025641025641, "grad_norm": 1.1466793203775922, "learning_rate": 4.627204123821563e-06, "loss": 0.7323, "step": 309 }, { "epoch": 2.6495726495726495, "grad_norm": 1.0888693227497082, "learning_rate": 4.624849635669797e-06, "loss": 0.744, "step": 310 }, { "epoch": 2.658119658119658, "grad_norm": 1.094040236503888, "learning_rate": 4.622488338670792e-06, "loss": 0.7397, "step": 311 }, { "epoch": 2.6666666666666665, "grad_norm": 1.133946860080225, "learning_rate": 4.620120240391065e-06, "loss": 0.4986, "step": 312 }, { "epoch": 2.6752136752136755, "grad_norm": 1.0023771702905808, "learning_rate": 4.617745348418928e-06, "loss": 0.8425, "step": 313 }, { "epoch": 2.683760683760684, "grad_norm": 1.0556369869801077, "learning_rate": 4.61536367036446e-06, "loss": 0.5125, "step": 314 }, { "epoch": 2.6923076923076925, "grad_norm": 1.1430379187000481, "learning_rate": 4.612975213859487e-06, "loss": 0.7373, "step": 315 }, { "epoch": 2.700854700854701, "grad_norm": 1.0652237090128354, "learning_rate": 4.6105799865575565e-06, "loss": 0.3925, "step": 316 }, { "epoch": 2.7094017094017095, "grad_norm": 0.9895034294385068, "learning_rate": 4.60817799613391e-06, "loss": 0.5777, "step": 317 }, { "epoch": 2.717948717948718, "grad_norm": 0.9336438508199297, "learning_rate": 4.605769250285462e-06, "loss": 0.6017, "step": 318 }, { "epoch": 2.7264957264957266, "grad_norm": 1.0611915631340922, "learning_rate": 4.603353756730775e-06, "loss": 0.8405, "step": 319 }, { "epoch": 2.735042735042735, "grad_norm": 1.0455572665815593, "learning_rate": 4.600931523210032e-06, "loss": 0.5468, "step": 320 }, { "epoch": 2.7435897435897436, "grad_norm": 0.9999670533186025, "learning_rate": 4.598502557485015e-06, "loss": 0.5728, "step": 321 }, { "epoch": 2.752136752136752, "grad_norm": 1.0769830544040775, "learning_rate": 4.5960668673390776e-06, "loss": 0.7857, "step": 322 }, { "epoch": 2.7606837606837606, "grad_norm": 0.934935206627796, "learning_rate": 4.59362446057712e-06, "loss": 0.6265, "step": 323 }, { "epoch": 2.769230769230769, "grad_norm": 1.0960881582165751, "learning_rate": 4.591175345025567e-06, "loss": 0.4837, "step": 324 }, { "epoch": 2.7777777777777777, "grad_norm": 0.9454844727455721, "learning_rate": 4.588719528532342e-06, "loss": 0.6177, "step": 325 }, { "epoch": 2.786324786324786, "grad_norm": 1.1553344447596179, "learning_rate": 4.586257018966837e-06, "loss": 0.8151, "step": 326 }, { "epoch": 2.7948717948717947, "grad_norm": 1.0896014725475482, "learning_rate": 4.583787824219894e-06, "loss": 0.6745, "step": 327 }, { "epoch": 2.8034188034188032, "grad_norm": 1.1258318879696727, "learning_rate": 4.5813119522037765e-06, "loss": 0.7511, "step": 328 }, { "epoch": 2.8119658119658117, "grad_norm": 1.048893301821594, "learning_rate": 4.578829410852145e-06, "loss": 0.8495, "step": 329 }, { "epoch": 2.8205128205128203, "grad_norm": 1.1543372343743354, "learning_rate": 4.5763402081200295e-06, "loss": 0.52, "step": 330 }, { "epoch": 2.8290598290598292, "grad_norm": 0.9769569666395369, "learning_rate": 4.573844351983807e-06, "loss": 0.5414, "step": 331 }, { "epoch": 2.8376068376068377, "grad_norm": 1.1088566573346965, "learning_rate": 4.571341850441175e-06, "loss": 0.545, "step": 332 }, { "epoch": 2.8461538461538463, "grad_norm": 1.144611010547972, "learning_rate": 4.568832711511125e-06, "loss": 0.6951, "step": 333 }, { "epoch": 2.8547008547008548, "grad_norm": 0.8252577047852332, "learning_rate": 4.566316943233916e-06, "loss": 0.5658, "step": 334 }, { "epoch": 2.8632478632478633, "grad_norm": 0.8953345634595089, "learning_rate": 4.56379455367105e-06, "loss": 0.4984, "step": 335 }, { "epoch": 2.871794871794872, "grad_norm": 1.1004516056302105, "learning_rate": 4.561265550905251e-06, "loss": 0.415, "step": 336 }, { "epoch": 2.8803418803418803, "grad_norm": 1.0095678234487127, "learning_rate": 4.558729943040427e-06, "loss": 0.569, "step": 337 }, { "epoch": 2.888888888888889, "grad_norm": 0.9321134809739067, "learning_rate": 4.556187738201656e-06, "loss": 0.6132, "step": 338 }, { "epoch": 2.8974358974358974, "grad_norm": 1.0238817667487585, "learning_rate": 4.553638944535155e-06, "loss": 0.5555, "step": 339 }, { "epoch": 2.905982905982906, "grad_norm": 1.0428311557389862, "learning_rate": 4.551083570208251e-06, "loss": 0.6989, "step": 340 }, { "epoch": 2.9145299145299144, "grad_norm": 0.9124242004855897, "learning_rate": 4.548521623409364e-06, "loss": 0.47, "step": 341 }, { "epoch": 2.9230769230769234, "grad_norm": 1.2657698218051596, "learning_rate": 4.545953112347967e-06, "loss": 0.6867, "step": 342 }, { "epoch": 2.931623931623932, "grad_norm": 1.1422213876431186, "learning_rate": 4.543378045254575e-06, "loss": 0.5938, "step": 343 }, { "epoch": 2.9401709401709404, "grad_norm": 1.1667272144576992, "learning_rate": 4.540796430380706e-06, "loss": 0.5566, "step": 344 }, { "epoch": 2.948717948717949, "grad_norm": 1.2696464046225642, "learning_rate": 4.538208275998861e-06, "loss": 0.7144, "step": 345 }, { "epoch": 2.9572649572649574, "grad_norm": 1.0653410607116212, "learning_rate": 4.535613590402497e-06, "loss": 0.618, "step": 346 }, { "epoch": 2.965811965811966, "grad_norm": 1.0344003466892562, "learning_rate": 4.533012381905999e-06, "loss": 0.5135, "step": 347 }, { "epoch": 2.9743589743589745, "grad_norm": 1.0045172330544723, "learning_rate": 4.530404658844654e-06, "loss": 0.4556, "step": 348 }, { "epoch": 2.982905982905983, "grad_norm": 1.1125798982264483, "learning_rate": 4.527790429574623e-06, "loss": 0.714, "step": 349 }, { "epoch": 2.9914529914529915, "grad_norm": 1.0281797974354252, "learning_rate": 4.525169702472917e-06, "loss": 0.3698, "step": 350 }, { "epoch": 3.0, "grad_norm": 0.9244672248228908, "learning_rate": 4.522542485937369e-06, "loss": 0.4979, "step": 351 }, { "epoch": 3.0085470085470085, "grad_norm": 0.8672830130925646, "learning_rate": 4.519908788386605e-06, "loss": 0.5368, "step": 352 }, { "epoch": 3.017094017094017, "grad_norm": 0.8961300006092746, "learning_rate": 4.51726861826002e-06, "loss": 0.5831, "step": 353 }, { "epoch": 3.0256410256410255, "grad_norm": 1.055331807150709, "learning_rate": 4.514621984017748e-06, "loss": 0.4283, "step": 354 }, { "epoch": 3.034188034188034, "grad_norm": 0.922217785174904, "learning_rate": 4.511968894140639e-06, "loss": 0.7095, "step": 355 }, { "epoch": 3.0427350427350426, "grad_norm": 1.0000694441929876, "learning_rate": 4.509309357130227e-06, "loss": 0.5385, "step": 356 }, { "epoch": 3.051282051282051, "grad_norm": 0.9230262407432018, "learning_rate": 4.5066433815087076e-06, "loss": 0.4442, "step": 357 }, { "epoch": 3.0598290598290596, "grad_norm": 1.0391084822072165, "learning_rate": 4.503970975818905e-06, "loss": 0.5411, "step": 358 }, { "epoch": 3.0683760683760686, "grad_norm": 1.028051333795015, "learning_rate": 4.501292148624251e-06, "loss": 0.5172, "step": 359 }, { "epoch": 3.076923076923077, "grad_norm": 1.050209054930378, "learning_rate": 4.498606908508754e-06, "loss": 0.5412, "step": 360 }, { "epoch": 3.0854700854700856, "grad_norm": 1.0309848435184052, "learning_rate": 4.495915264076967e-06, "loss": 0.5168, "step": 361 }, { "epoch": 3.094017094017094, "grad_norm": 1.018331740357964, "learning_rate": 4.493217223953974e-06, "loss": 0.5317, "step": 362 }, { "epoch": 3.1025641025641026, "grad_norm": 1.0412643296482653, "learning_rate": 4.490512796785344e-06, "loss": 0.4551, "step": 363 }, { "epoch": 3.111111111111111, "grad_norm": 1.0473001504092734, "learning_rate": 4.48780199123712e-06, "loss": 0.4791, "step": 364 }, { "epoch": 3.1196581196581197, "grad_norm": 1.0841505339438793, "learning_rate": 4.485084815995778e-06, "loss": 0.467, "step": 365 }, { "epoch": 3.128205128205128, "grad_norm": 1.1191014630317282, "learning_rate": 4.482361279768209e-06, "loss": 0.5728, "step": 366 }, { "epoch": 3.1367521367521367, "grad_norm": 1.0474104646663822, "learning_rate": 4.479631391281685e-06, "loss": 0.8182, "step": 367 }, { "epoch": 3.1452991452991452, "grad_norm": 1.109664777957958, "learning_rate": 4.476895159283835e-06, "loss": 0.7313, "step": 368 }, { "epoch": 3.1538461538461537, "grad_norm": 0.9922516229814357, "learning_rate": 4.474152592542613e-06, "loss": 0.5853, "step": 369 }, { "epoch": 3.1623931623931623, "grad_norm": 1.0930241384311647, "learning_rate": 4.4714036998462715e-06, "loss": 0.7107, "step": 370 }, { "epoch": 3.1709401709401708, "grad_norm": 1.1835050143124834, "learning_rate": 4.4686484900033375e-06, "loss": 0.5722, "step": 371 }, { "epoch": 3.1794871794871793, "grad_norm": 0.9587380893066114, "learning_rate": 4.465886971842578e-06, "loss": 0.5348, "step": 372 }, { "epoch": 3.1880341880341883, "grad_norm": 1.069484300863465, "learning_rate": 4.463119154212972e-06, "loss": 0.5701, "step": 373 }, { "epoch": 3.1965811965811968, "grad_norm": 0.9644695900573369, "learning_rate": 4.46034504598369e-06, "loss": 0.5821, "step": 374 }, { "epoch": 3.2051282051282053, "grad_norm": 1.2138323456142817, "learning_rate": 4.457564656044056e-06, "loss": 0.4864, "step": 375 }, { "epoch": 3.213675213675214, "grad_norm": 1.0335758299400877, "learning_rate": 4.454777993303524e-06, "loss": 0.736, "step": 376 }, { "epoch": 3.2222222222222223, "grad_norm": 1.215687453793553, "learning_rate": 4.451985066691649e-06, "loss": 0.4861, "step": 377 }, { "epoch": 3.230769230769231, "grad_norm": 0.9760526489106304, "learning_rate": 4.449185885158056e-06, "loss": 0.473, "step": 378 }, { "epoch": 3.2393162393162394, "grad_norm": 1.025138689320582, "learning_rate": 4.446380457672417e-06, "loss": 0.438, "step": 379 }, { "epoch": 3.247863247863248, "grad_norm": 1.0304858209940453, "learning_rate": 4.443568793224415e-06, "loss": 0.5764, "step": 380 }, { "epoch": 3.2564102564102564, "grad_norm": 1.0641902860180872, "learning_rate": 4.44075090082372e-06, "loss": 0.5768, "step": 381 }, { "epoch": 3.264957264957265, "grad_norm": 1.0141681638042424, "learning_rate": 4.437926789499959e-06, "loss": 0.6206, "step": 382 }, { "epoch": 3.2735042735042734, "grad_norm": 0.9719514259271992, "learning_rate": 4.435096468302687e-06, "loss": 0.3911, "step": 383 }, { "epoch": 3.282051282051282, "grad_norm": 1.0928813834901747, "learning_rate": 4.432259946301355e-06, "loss": 0.5447, "step": 384 }, { "epoch": 3.2905982905982905, "grad_norm": 1.1490777184396737, "learning_rate": 4.429417232585288e-06, "loss": 0.5154, "step": 385 }, { "epoch": 3.299145299145299, "grad_norm": 1.1605361713285143, "learning_rate": 4.42656833626365e-06, "loss": 0.379, "step": 386 }, { "epoch": 3.3076923076923075, "grad_norm": 1.1628214583644674, "learning_rate": 4.423713266465415e-06, "loss": 0.5248, "step": 387 }, { "epoch": 3.316239316239316, "grad_norm": 1.1349331779893848, "learning_rate": 4.4208520323393425e-06, "loss": 0.4598, "step": 388 }, { "epoch": 3.324786324786325, "grad_norm": 1.1043916941033478, "learning_rate": 4.417984643053941e-06, "loss": 0.744, "step": 389 }, { "epoch": 3.3333333333333335, "grad_norm": 0.9604478327433834, "learning_rate": 4.415111107797445e-06, "loss": 0.5136, "step": 390 }, { "epoch": 3.341880341880342, "grad_norm": 1.280358040612093, "learning_rate": 4.412231435777784e-06, "loss": 0.5602, "step": 391 }, { "epoch": 3.3504273504273505, "grad_norm": 1.055925678568642, "learning_rate": 4.409345636222549e-06, "loss": 0.5936, "step": 392 }, { "epoch": 3.358974358974359, "grad_norm": 1.0454713545217449, "learning_rate": 4.406453718378968e-06, "loss": 0.7998, "step": 393 }, { "epoch": 3.3675213675213675, "grad_norm": 1.0350994528202249, "learning_rate": 4.4035556915138745e-06, "loss": 0.5932, "step": 394 }, { "epoch": 3.376068376068376, "grad_norm": 1.0672167525817227, "learning_rate": 4.400651564913676e-06, "loss": 0.6762, "step": 395 }, { "epoch": 3.3846153846153846, "grad_norm": 1.0783182827330902, "learning_rate": 4.397741347884329e-06, "loss": 0.623, "step": 396 }, { "epoch": 3.393162393162393, "grad_norm": 1.1809114761670625, "learning_rate": 4.394825049751303e-06, "loss": 0.7212, "step": 397 }, { "epoch": 3.4017094017094016, "grad_norm": 1.2022044534831018, "learning_rate": 4.391902679859557e-06, "loss": 0.3709, "step": 398 }, { "epoch": 3.41025641025641, "grad_norm": 1.1287039650189177, "learning_rate": 4.388974247573501e-06, "loss": 0.5523, "step": 399 }, { "epoch": 3.4188034188034186, "grad_norm": 1.1605193293531824, "learning_rate": 4.386039762276976e-06, "loss": 0.556, "step": 400 }, { "epoch": 3.427350427350427, "grad_norm": 1.127095779493485, "learning_rate": 4.3830992333732185e-06, "loss": 0.5659, "step": 401 }, { "epoch": 3.435897435897436, "grad_norm": 1.0387250899849618, "learning_rate": 4.3801526702848306e-06, "loss": 0.632, "step": 402 }, { "epoch": 3.4444444444444446, "grad_norm": 0.9521939601614868, "learning_rate": 4.377200082453748e-06, "loss": 0.4848, "step": 403 }, { "epoch": 3.452991452991453, "grad_norm": 1.0609804557798395, "learning_rate": 4.374241479341216e-06, "loss": 0.4597, "step": 404 }, { "epoch": 3.4615384615384617, "grad_norm": 1.122757058652312, "learning_rate": 4.3712768704277535e-06, "loss": 0.5509, "step": 405 }, { "epoch": 3.47008547008547, "grad_norm": 1.2171462597861757, "learning_rate": 4.368306265213122e-06, "loss": 0.5441, "step": 406 }, { "epoch": 3.4786324786324787, "grad_norm": 1.1815243719579964, "learning_rate": 4.365329673216301e-06, "loss": 0.4705, "step": 407 }, { "epoch": 3.4871794871794872, "grad_norm": 1.0455156764851956, "learning_rate": 4.3623471039754525e-06, "loss": 0.6764, "step": 408 }, { "epoch": 3.4957264957264957, "grad_norm": 1.0444224354943048, "learning_rate": 4.359358567047892e-06, "loss": 0.5449, "step": 409 }, { "epoch": 3.5042735042735043, "grad_norm": 0.9079323012608914, "learning_rate": 4.356364072010059e-06, "loss": 0.6122, "step": 410 }, { "epoch": 3.5128205128205128, "grad_norm": 1.1994820113729636, "learning_rate": 4.35336362845748e-06, "loss": 0.9035, "step": 411 }, { "epoch": 3.5213675213675213, "grad_norm": 1.0572483323899298, "learning_rate": 4.35035724600475e-06, "loss": 0.7419, "step": 412 }, { "epoch": 3.52991452991453, "grad_norm": 1.0994551505656394, "learning_rate": 4.347344934285492e-06, "loss": 0.7047, "step": 413 }, { "epoch": 3.5384615384615383, "grad_norm": 1.2445047273471745, "learning_rate": 4.3443267029523265e-06, "loss": 0.6336, "step": 414 }, { "epoch": 3.547008547008547, "grad_norm": 0.9353142553554201, "learning_rate": 4.3413025616768426e-06, "loss": 0.5763, "step": 415 }, { "epoch": 3.5555555555555554, "grad_norm": 1.0815380950652063, "learning_rate": 4.338272520149572e-06, "loss": 0.5757, "step": 416 }, { "epoch": 3.564102564102564, "grad_norm": 1.0211900635168667, "learning_rate": 4.335236588079949e-06, "loss": 0.5728, "step": 417 }, { "epoch": 3.5726495726495724, "grad_norm": 1.0205341832766097, "learning_rate": 4.332194775196282e-06, "loss": 0.6203, "step": 418 }, { "epoch": 3.5811965811965814, "grad_norm": 1.0250376149242053, "learning_rate": 4.329147091245729e-06, "loss": 0.4802, "step": 419 }, { "epoch": 3.58974358974359, "grad_norm": 1.0498223443754728, "learning_rate": 4.326093545994258e-06, "loss": 0.4724, "step": 420 }, { "epoch": 3.5982905982905984, "grad_norm": 1.126184117542039, "learning_rate": 4.3230341492266195e-06, "loss": 0.5114, "step": 421 }, { "epoch": 3.606837606837607, "grad_norm": 1.0434562063600874, "learning_rate": 4.3199689107463125e-06, "loss": 0.4119, "step": 422 }, { "epoch": 3.6153846153846154, "grad_norm": 1.0248635506729542, "learning_rate": 4.316897840375558e-06, "loss": 0.5205, "step": 423 }, { "epoch": 3.623931623931624, "grad_norm": 1.0077448554942794, "learning_rate": 4.313820947955265e-06, "loss": 0.4752, "step": 424 }, { "epoch": 3.6324786324786325, "grad_norm": 1.031230850742542, "learning_rate": 4.310738243344996e-06, "loss": 0.4966, "step": 425 }, { "epoch": 3.641025641025641, "grad_norm": 1.1227610381702842, "learning_rate": 4.307649736422939e-06, "loss": 0.3805, "step": 426 }, { "epoch": 3.6495726495726495, "grad_norm": 1.1094301736827121, "learning_rate": 4.304555437085876e-06, "loss": 0.5154, "step": 427 }, { "epoch": 3.658119658119658, "grad_norm": 1.1641744131192364, "learning_rate": 4.301455355249148e-06, "loss": 0.6265, "step": 428 }, { "epoch": 3.6666666666666665, "grad_norm": 1.3182527561357478, "learning_rate": 4.2983495008466285e-06, "loss": 0.5251, "step": 429 }, { "epoch": 3.6752136752136755, "grad_norm": 1.057755716798989, "learning_rate": 4.2952378838306855e-06, "loss": 0.549, "step": 430 }, { "epoch": 3.683760683760684, "grad_norm": 1.1167572928661555, "learning_rate": 4.292120514172154e-06, "loss": 0.5538, "step": 431 }, { "epoch": 3.6923076923076925, "grad_norm": 1.0189034773737762, "learning_rate": 4.288997401860303e-06, "loss": 0.4497, "step": 432 }, { "epoch": 3.700854700854701, "grad_norm": 1.1052813795847365, "learning_rate": 4.285868556902803e-06, "loss": 0.5662, "step": 433 }, { "epoch": 3.7094017094017095, "grad_norm": 1.1983962217285142, "learning_rate": 4.2827339893256935e-06, "loss": 0.5738, "step": 434 }, { "epoch": 3.717948717948718, "grad_norm": 1.217707027286218, "learning_rate": 4.279593709173352e-06, "loss": 0.583, "step": 435 }, { "epoch": 3.7264957264957266, "grad_norm": 1.0467283527547593, "learning_rate": 4.276447726508461e-06, "loss": 0.6682, "step": 436 }, { "epoch": 3.735042735042735, "grad_norm": 1.0574228677466284, "learning_rate": 4.273296051411978e-06, "loss": 0.6307, "step": 437 }, { "epoch": 3.7435897435897436, "grad_norm": 1.0940623257733564, "learning_rate": 4.2701386939830966e-06, "loss": 0.7574, "step": 438 }, { "epoch": 3.752136752136752, "grad_norm": 1.2037642109054294, "learning_rate": 4.2669756643392255e-06, "loss": 0.744, "step": 439 }, { "epoch": 3.7606837606837606, "grad_norm": 1.0771890780466298, "learning_rate": 4.263806972615943e-06, "loss": 0.3926, "step": 440 }, { "epoch": 3.769230769230769, "grad_norm": 1.1140250234730074, "learning_rate": 4.260632628966974e-06, "loss": 0.5387, "step": 441 }, { "epoch": 3.7777777777777777, "grad_norm": 1.0216104979859169, "learning_rate": 4.257452643564155e-06, "loss": 0.4784, "step": 442 }, { "epoch": 3.786324786324786, "grad_norm": 1.031163827042152, "learning_rate": 4.254267026597399e-06, "loss": 0.6873, "step": 443 }, { "epoch": 3.7948717948717947, "grad_norm": 1.182047554124205, "learning_rate": 4.251075788274667e-06, "loss": 0.306, "step": 444 }, { "epoch": 3.8034188034188032, "grad_norm": 1.1464751159617779, "learning_rate": 4.247878938821929e-06, "loss": 0.5585, "step": 445 }, { "epoch": 3.8119658119658117, "grad_norm": 1.0901425013917103, "learning_rate": 4.2446764884831404e-06, "loss": 0.6373, "step": 446 }, { "epoch": 3.8205128205128203, "grad_norm": 1.1012303502937792, "learning_rate": 4.2414684475202014e-06, "loss": 0.554, "step": 447 }, { "epoch": 3.8290598290598292, "grad_norm": 1.141830081568288, "learning_rate": 4.238254826212925e-06, "loss": 0.6192, "step": 448 }, { "epoch": 3.8376068376068377, "grad_norm": 1.11435215769542, "learning_rate": 4.2350356348590096e-06, "loss": 0.5138, "step": 449 }, { "epoch": 3.8461538461538463, "grad_norm": 1.0466376945625686, "learning_rate": 4.231810883773999e-06, "loss": 0.6832, "step": 450 }, { "epoch": 3.8547008547008548, "grad_norm": 1.1772526601190012, "learning_rate": 4.228580583291254e-06, "loss": 0.5216, "step": 451 }, { "epoch": 3.8632478632478633, "grad_norm": 1.0414249916352694, "learning_rate": 4.225344743761918e-06, "loss": 0.5568, "step": 452 }, { "epoch": 3.871794871794872, "grad_norm": 1.135999207400427, "learning_rate": 4.2221033755548835e-06, "loss": 0.4843, "step": 453 }, { "epoch": 3.8803418803418803, "grad_norm": 1.1714177527617442, "learning_rate": 4.218856489056758e-06, "loss": 0.3179, "step": 454 }, { "epoch": 3.888888888888889, "grad_norm": 0.9729054252414072, "learning_rate": 4.215604094671835e-06, "loss": 0.6261, "step": 455 }, { "epoch": 3.8974358974358974, "grad_norm": 1.066184341545607, "learning_rate": 4.2123462028220505e-06, "loss": 0.6724, "step": 456 }, { "epoch": 3.905982905982906, "grad_norm": 1.230748130789342, "learning_rate": 4.209082823946965e-06, "loss": 0.8234, "step": 457 }, { "epoch": 3.9145299145299144, "grad_norm": 1.1988530892117408, "learning_rate": 4.205813968503717e-06, "loss": 0.6273, "step": 458 }, { "epoch": 3.9230769230769234, "grad_norm": 1.0786203487111534, "learning_rate": 4.202539646966993e-06, "loss": 0.5313, "step": 459 }, { "epoch": 3.931623931623932, "grad_norm": 0.9160073002850568, "learning_rate": 4.1992598698289985e-06, "loss": 0.3638, "step": 460 }, { "epoch": 3.9401709401709404, "grad_norm": 1.1479053266981039, "learning_rate": 4.1959746475994175e-06, "loss": 0.6149, "step": 461 }, { "epoch": 3.948717948717949, "grad_norm": 0.9316297041443543, "learning_rate": 4.1926839908053855e-06, "loss": 0.5541, "step": 462 }, { "epoch": 3.9572649572649574, "grad_norm": 1.0830846381257961, "learning_rate": 4.189387909991448e-06, "loss": 0.5234, "step": 463 }, { "epoch": 3.965811965811966, "grad_norm": 1.1678471598237286, "learning_rate": 4.186086415719537e-06, "loss": 0.5652, "step": 464 }, { "epoch": 3.9743589743589745, "grad_norm": 0.9099147923682155, "learning_rate": 4.182779518568925e-06, "loss": 0.5946, "step": 465 }, { "epoch": 3.982905982905983, "grad_norm": 1.1141807669712536, "learning_rate": 4.179467229136205e-06, "loss": 0.3703, "step": 466 }, { "epoch": 3.9914529914529915, "grad_norm": 1.4764971597166958, "learning_rate": 4.176149558035241e-06, "loss": 0.6054, "step": 467 }, { "epoch": 4.0, "grad_norm": 1.22721031943352, "learning_rate": 4.172826515897146e-06, "loss": 0.4847, "step": 468 }, { "epoch": 4.0085470085470085, "grad_norm": 0.875230516002503, "learning_rate": 4.169498113370245e-06, "loss": 0.6291, "step": 469 }, { "epoch": 4.017094017094017, "grad_norm": 1.272482921335239, "learning_rate": 4.166164361120036e-06, "loss": 0.3956, "step": 470 }, { "epoch": 4.0256410256410255, "grad_norm": 0.9556535023235093, "learning_rate": 4.162825269829165e-06, "loss": 0.5645, "step": 471 }, { "epoch": 4.034188034188034, "grad_norm": 0.9656405185189749, "learning_rate": 4.15948085019738e-06, "loss": 0.3914, "step": 472 }, { "epoch": 4.042735042735043, "grad_norm": 1.2321645936844894, "learning_rate": 4.156131112941509e-06, "loss": 0.588, "step": 473 }, { "epoch": 4.051282051282051, "grad_norm": 1.367549160162955, "learning_rate": 4.152776068795416e-06, "loss": 0.6519, "step": 474 }, { "epoch": 4.05982905982906, "grad_norm": 0.9481306468484307, "learning_rate": 4.149415728509971e-06, "loss": 0.5701, "step": 475 }, { "epoch": 4.068376068376068, "grad_norm": 1.1701525380439926, "learning_rate": 4.146050102853015e-06, "loss": 0.4844, "step": 476 }, { "epoch": 4.076923076923077, "grad_norm": 1.1247319279211934, "learning_rate": 4.1426792026093274e-06, "loss": 0.7878, "step": 477 }, { "epoch": 4.085470085470085, "grad_norm": 1.1237398995766223, "learning_rate": 4.139303038580586e-06, "loss": 0.4843, "step": 478 }, { "epoch": 4.094017094017094, "grad_norm": 1.0625870199426208, "learning_rate": 4.135921621585338e-06, "loss": 0.3677, "step": 479 }, { "epoch": 4.102564102564102, "grad_norm": 1.14727227991621, "learning_rate": 4.1325349624589625e-06, "loss": 0.6442, "step": 480 }, { "epoch": 4.111111111111111, "grad_norm": 1.0477961628751995, "learning_rate": 4.129143072053639e-06, "loss": 0.5369, "step": 481 }, { "epoch": 4.119658119658119, "grad_norm": 1.087010322607983, "learning_rate": 4.125745961238305e-06, "loss": 0.392, "step": 482 }, { "epoch": 4.128205128205128, "grad_norm": 1.2603627244731999, "learning_rate": 4.122343640898628e-06, "loss": 0.5996, "step": 483 }, { "epoch": 4.136752136752137, "grad_norm": 1.1308786469388739, "learning_rate": 4.118936121936973e-06, "loss": 0.6201, "step": 484 }, { "epoch": 4.145299145299146, "grad_norm": 0.9734248346069477, "learning_rate": 4.115523415272358e-06, "loss": 0.4135, "step": 485 }, { "epoch": 4.153846153846154, "grad_norm": 1.2509401315135145, "learning_rate": 4.112105531840427e-06, "loss": 0.7628, "step": 486 }, { "epoch": 4.162393162393163, "grad_norm": 1.2324367647560945, "learning_rate": 4.1086824825934126e-06, "loss": 0.4189, "step": 487 }, { "epoch": 4.170940170940171, "grad_norm": 1.5871825728125233, "learning_rate": 4.1052542785001e-06, "loss": 0.4659, "step": 488 }, { "epoch": 4.17948717948718, "grad_norm": 1.2427938238759526, "learning_rate": 4.101820930545792e-06, "loss": 0.5776, "step": 489 }, { "epoch": 4.188034188034188, "grad_norm": 1.1040081714838972, "learning_rate": 4.098382449732276e-06, "loss": 0.5746, "step": 490 }, { "epoch": 4.196581196581197, "grad_norm": 1.399147839889112, "learning_rate": 4.094938847077784e-06, "loss": 0.7351, "step": 491 }, { "epoch": 4.205128205128205, "grad_norm": 1.2039881642302797, "learning_rate": 4.091490133616965e-06, "loss": 0.3633, "step": 492 }, { "epoch": 4.213675213675214, "grad_norm": 1.363284686280309, "learning_rate": 4.08803632040084e-06, "loss": 0.4315, "step": 493 }, { "epoch": 4.222222222222222, "grad_norm": 1.0603492653742104, "learning_rate": 4.084577418496775e-06, "loss": 0.4569, "step": 494 }, { "epoch": 4.230769230769231, "grad_norm": 1.103163024491336, "learning_rate": 4.081113438988443e-06, "loss": 0.4104, "step": 495 }, { "epoch": 4.239316239316239, "grad_norm": 1.0528872661282709, "learning_rate": 4.077644392975785e-06, "loss": 0.3354, "step": 496 }, { "epoch": 4.247863247863248, "grad_norm": 1.2879686313297776, "learning_rate": 4.074170291574975e-06, "loss": 0.5563, "step": 497 }, { "epoch": 4.256410256410256, "grad_norm": 1.045397447019931, "learning_rate": 4.0706911459183915e-06, "loss": 0.5623, "step": 498 }, { "epoch": 4.264957264957265, "grad_norm": 0.9731163261614564, "learning_rate": 4.067206967154575e-06, "loss": 0.4018, "step": 499 }, { "epoch": 4.273504273504273, "grad_norm": 1.0969564649833976, "learning_rate": 4.063717766448194e-06, "loss": 0.4298, "step": 500 }, { "epoch": 4.282051282051282, "grad_norm": 1.0639228552178774, "learning_rate": 4.060223554980007e-06, "loss": 0.5563, "step": 501 }, { "epoch": 4.2905982905982905, "grad_norm": 1.2587006164595462, "learning_rate": 4.056724343946832e-06, "loss": 0.3347, "step": 502 }, { "epoch": 4.299145299145299, "grad_norm": 1.076742709469613, "learning_rate": 4.053220144561506e-06, "loss": 0.3666, "step": 503 }, { "epoch": 4.3076923076923075, "grad_norm": 1.1869023630845126, "learning_rate": 4.049710968052851e-06, "loss": 0.5493, "step": 504 }, { "epoch": 4.316239316239316, "grad_norm": 1.2762763414877694, "learning_rate": 4.046196825665638e-06, "loss": 0.4113, "step": 505 }, { "epoch": 4.3247863247863245, "grad_norm": 1.256286291317964, "learning_rate": 4.042677728660549e-06, "loss": 0.3814, "step": 506 }, { "epoch": 4.333333333333333, "grad_norm": 1.2060736250753987, "learning_rate": 4.039153688314146e-06, "loss": 0.5914, "step": 507 }, { "epoch": 4.3418803418803416, "grad_norm": 1.1176723905627919, "learning_rate": 4.035624715918827e-06, "loss": 0.6334, "step": 508 }, { "epoch": 4.35042735042735, "grad_norm": 1.5033666933742025, "learning_rate": 4.032090822782798e-06, "loss": 0.4644, "step": 509 }, { "epoch": 4.358974358974359, "grad_norm": 1.1687610354628746, "learning_rate": 4.028552020230031e-06, "loss": 0.4106, "step": 510 }, { "epoch": 4.367521367521368, "grad_norm": 1.2015668421384156, "learning_rate": 4.0250083196002285e-06, "loss": 0.5407, "step": 511 }, { "epoch": 4.3760683760683765, "grad_norm": 1.1147036921086226, "learning_rate": 4.021459732248792e-06, "loss": 0.358, "step": 512 }, { "epoch": 4.384615384615385, "grad_norm": 1.5367915145923337, "learning_rate": 4.017906269546778e-06, "loss": 0.4539, "step": 513 }, { "epoch": 4.3931623931623935, "grad_norm": 1.304473709441864, "learning_rate": 4.014347942880869e-06, "loss": 0.4259, "step": 514 }, { "epoch": 4.401709401709402, "grad_norm": 1.1833828587337736, "learning_rate": 4.0107847636533314e-06, "loss": 0.4551, "step": 515 }, { "epoch": 4.410256410256411, "grad_norm": 0.9701845087775417, "learning_rate": 4.0072167432819804e-06, "loss": 0.3402, "step": 516 }, { "epoch": 4.418803418803419, "grad_norm": 1.3429029717262404, "learning_rate": 4.003643893200148e-06, "loss": 0.5363, "step": 517 }, { "epoch": 4.427350427350428, "grad_norm": 1.4813898446999763, "learning_rate": 4.000066224856636e-06, "loss": 0.6563, "step": 518 }, { "epoch": 4.435897435897436, "grad_norm": 1.0620532686129334, "learning_rate": 3.996483749715694e-06, "loss": 0.5307, "step": 519 }, { "epoch": 4.444444444444445, "grad_norm": 1.1674492551898705, "learning_rate": 3.992896479256966e-06, "loss": 0.5044, "step": 520 }, { "epoch": 4.452991452991453, "grad_norm": 1.089978982595774, "learning_rate": 3.989304424975468e-06, "loss": 0.4327, "step": 521 }, { "epoch": 4.461538461538462, "grad_norm": 1.0065363497501418, "learning_rate": 3.985707598381544e-06, "loss": 0.55, "step": 522 }, { "epoch": 4.47008547008547, "grad_norm": 1.0697673878641412, "learning_rate": 3.9821060110008295e-06, "loss": 0.5825, "step": 523 }, { "epoch": 4.478632478632479, "grad_norm": 1.1226330146224837, "learning_rate": 3.978499674374214e-06, "loss": 0.5514, "step": 524 }, { "epoch": 4.487179487179487, "grad_norm": 1.178704468790716, "learning_rate": 3.974888600057808e-06, "loss": 0.4042, "step": 525 }, { "epoch": 4.495726495726496, "grad_norm": 1.1844014873988862, "learning_rate": 3.971272799622903e-06, "loss": 0.6905, "step": 526 }, { "epoch": 4.504273504273504, "grad_norm": 1.3063239169598142, "learning_rate": 3.967652284655933e-06, "loss": 0.464, "step": 527 }, { "epoch": 4.512820512820513, "grad_norm": 1.3805070176396315, "learning_rate": 3.964027066758442e-06, "loss": 0.3515, "step": 528 }, { "epoch": 4.521367521367521, "grad_norm": 1.0907984273825841, "learning_rate": 3.960397157547043e-06, "loss": 0.528, "step": 529 }, { "epoch": 4.52991452991453, "grad_norm": 1.2859054101894118, "learning_rate": 3.956762568653378e-06, "loss": 0.598, "step": 530 }, { "epoch": 4.538461538461538, "grad_norm": 1.2422004006544314, "learning_rate": 3.953123311724092e-06, "loss": 0.6026, "step": 531 }, { "epoch": 4.547008547008547, "grad_norm": 1.2436712062281594, "learning_rate": 3.9494793984207815e-06, "loss": 0.358, "step": 532 }, { "epoch": 4.555555555555555, "grad_norm": 1.117015372814124, "learning_rate": 3.945830840419966e-06, "loss": 0.5461, "step": 533 }, { "epoch": 4.564102564102564, "grad_norm": 1.0053532044910118, "learning_rate": 3.942177649413051e-06, "loss": 0.5622, "step": 534 }, { "epoch": 4.572649572649572, "grad_norm": 1.2133323539666498, "learning_rate": 3.938519837106284e-06, "loss": 0.3818, "step": 535 }, { "epoch": 4.581196581196581, "grad_norm": 1.3183491629414597, "learning_rate": 3.9348574152207245e-06, "loss": 0.42, "step": 536 }, { "epoch": 4.589743589743589, "grad_norm": 1.0855334417363243, "learning_rate": 3.931190395492198e-06, "loss": 0.4367, "step": 537 }, { "epoch": 4.598290598290598, "grad_norm": 1.1830080951781239, "learning_rate": 3.92751878967127e-06, "loss": 0.5654, "step": 538 }, { "epoch": 4.6068376068376065, "grad_norm": 1.0025162147085154, "learning_rate": 3.923842609523195e-06, "loss": 0.5085, "step": 539 }, { "epoch": 4.615384615384615, "grad_norm": 1.1938032588726277, "learning_rate": 3.92016186682789e-06, "loss": 0.3901, "step": 540 }, { "epoch": 4.6239316239316235, "grad_norm": 1.044455553238553, "learning_rate": 3.91647657337989e-06, "loss": 0.5414, "step": 541 }, { "epoch": 4.632478632478632, "grad_norm": 1.0604389385547883, "learning_rate": 3.9127867409883145e-06, "loss": 0.5697, "step": 542 }, { "epoch": 4.641025641025641, "grad_norm": 1.030741191950751, "learning_rate": 3.909092381476824e-06, "loss": 0.4111, "step": 543 }, { "epoch": 4.64957264957265, "grad_norm": 1.2599857607075067, "learning_rate": 3.905393506683589e-06, "loss": 0.7456, "step": 544 }, { "epoch": 4.6581196581196584, "grad_norm": 1.1979763813132545, "learning_rate": 3.901690128461248e-06, "loss": 0.4687, "step": 545 }, { "epoch": 4.666666666666667, "grad_norm": 1.4424908787398834, "learning_rate": 3.897982258676867e-06, "loss": 0.6179, "step": 546 }, { "epoch": 4.6752136752136755, "grad_norm": 0.9608243421216695, "learning_rate": 3.894269909211911e-06, "loss": 0.5166, "step": 547 }, { "epoch": 4.683760683760684, "grad_norm": 1.2658167795968152, "learning_rate": 3.890553091962193e-06, "loss": 0.4927, "step": 548 }, { "epoch": 4.6923076923076925, "grad_norm": 1.4297839299427377, "learning_rate": 3.8868318188378475e-06, "loss": 0.3448, "step": 549 }, { "epoch": 4.700854700854701, "grad_norm": 1.1120064739717215, "learning_rate": 3.883106101763285e-06, "loss": 0.4685, "step": 550 }, { "epoch": 4.7094017094017095, "grad_norm": 1.093805953026953, "learning_rate": 3.879375952677156e-06, "loss": 0.4474, "step": 551 }, { "epoch": 4.717948717948718, "grad_norm": 1.0474731400789823, "learning_rate": 3.875641383532313e-06, "loss": 0.5464, "step": 552 }, { "epoch": 4.726495726495727, "grad_norm": 1.1262315965779182, "learning_rate": 3.871902406295775e-06, "loss": 0.4222, "step": 553 }, { "epoch": 4.735042735042735, "grad_norm": 0.964350644928307, "learning_rate": 3.868159032948681e-06, "loss": 0.4808, "step": 554 }, { "epoch": 4.743589743589744, "grad_norm": 1.0561833255928614, "learning_rate": 3.8644112754862614e-06, "loss": 0.6094, "step": 555 }, { "epoch": 4.752136752136752, "grad_norm": 1.0733675088449695, "learning_rate": 3.860659145917794e-06, "loss": 0.5114, "step": 556 }, { "epoch": 4.760683760683761, "grad_norm": 1.0653535243140473, "learning_rate": 3.856902656266563e-06, "loss": 0.4611, "step": 557 }, { "epoch": 4.769230769230769, "grad_norm": 1.1566280687891097, "learning_rate": 3.853141818569829e-06, "loss": 0.5141, "step": 558 }, { "epoch": 4.777777777777778, "grad_norm": 1.2272432059639735, "learning_rate": 3.849376644878783e-06, "loss": 0.4836, "step": 559 }, { "epoch": 4.786324786324786, "grad_norm": 1.0866113784148257, "learning_rate": 3.84560714725851e-06, "loss": 0.5022, "step": 560 }, { "epoch": 4.794871794871795, "grad_norm": 0.9904138308671785, "learning_rate": 3.841833337787951e-06, "loss": 0.5167, "step": 561 }, { "epoch": 4.803418803418803, "grad_norm": 1.2968094227448181, "learning_rate": 3.838055228559864e-06, "loss": 0.4508, "step": 562 }, { "epoch": 4.811965811965812, "grad_norm": 1.214225199676867, "learning_rate": 3.834272831680785e-06, "loss": 0.575, "step": 563 }, { "epoch": 4.82051282051282, "grad_norm": 1.2472021018422048, "learning_rate": 3.830486159270991e-06, "loss": 0.6268, "step": 564 }, { "epoch": 4.829059829059829, "grad_norm": 0.9172374646659589, "learning_rate": 3.826695223464455e-06, "loss": 0.6032, "step": 565 }, { "epoch": 4.837606837606837, "grad_norm": 1.0304247295278326, "learning_rate": 3.822900036408815e-06, "loss": 0.6203, "step": 566 }, { "epoch": 4.846153846153846, "grad_norm": 1.0666290948925694, "learning_rate": 3.819100610265332e-06, "loss": 0.4967, "step": 567 }, { "epoch": 4.854700854700854, "grad_norm": 1.2495026717724254, "learning_rate": 3.815296957208849e-06, "loss": 0.4664, "step": 568 }, { "epoch": 4.863247863247864, "grad_norm": 1.1007215063135107, "learning_rate": 3.811489089427756e-06, "loss": 0.51, "step": 569 }, { "epoch": 4.871794871794872, "grad_norm": 1.2148804714926256, "learning_rate": 3.8076770191239444e-06, "loss": 0.8572, "step": 570 }, { "epoch": 4.880341880341881, "grad_norm": 1.0075512142961376, "learning_rate": 3.8038607585127762e-06, "loss": 0.7746, "step": 571 }, { "epoch": 4.888888888888889, "grad_norm": 1.1675958255183994, "learning_rate": 3.8000403198230385e-06, "loss": 0.4149, "step": 572 }, { "epoch": 4.897435897435898, "grad_norm": 2.3479618762202166, "learning_rate": 3.7962157152969093e-06, "loss": 0.3843, "step": 573 }, { "epoch": 4.905982905982906, "grad_norm": 1.339128802411911, "learning_rate": 3.7923869571899115e-06, "loss": 0.4443, "step": 574 }, { "epoch": 4.914529914529915, "grad_norm": 1.2598653276333198, "learning_rate": 3.7885540577708806e-06, "loss": 0.599, "step": 575 }, { "epoch": 4.923076923076923, "grad_norm": 1.273835748560621, "learning_rate": 3.7847170293219223e-06, "loss": 0.5045, "step": 576 }, { "epoch": 4.931623931623932, "grad_norm": 1.1962056104312029, "learning_rate": 3.780875884138372e-06, "loss": 0.3395, "step": 577 }, { "epoch": 4.94017094017094, "grad_norm": 0.9688429291410264, "learning_rate": 3.7770306345287577e-06, "loss": 0.5456, "step": 578 }, { "epoch": 4.948717948717949, "grad_norm": 1.0556261135022196, "learning_rate": 3.7731812928147593e-06, "loss": 0.4656, "step": 579 }, { "epoch": 4.957264957264957, "grad_norm": 1.172036901000504, "learning_rate": 3.76932787133117e-06, "loss": 0.4111, "step": 580 }, { "epoch": 4.965811965811966, "grad_norm": 1.1936432934962213, "learning_rate": 3.7654703824258544e-06, "loss": 0.511, "step": 581 }, { "epoch": 4.9743589743589745, "grad_norm": 1.0927271152964717, "learning_rate": 3.7616088384597138e-06, "loss": 0.7687, "step": 582 }, { "epoch": 4.982905982905983, "grad_norm": 1.0498423497144054, "learning_rate": 3.757743251806639e-06, "loss": 0.522, "step": 583 }, { "epoch": 4.9914529914529915, "grad_norm": 1.0465049477381851, "learning_rate": 3.753873634853481e-06, "loss": 0.5185, "step": 584 }, { "epoch": 5.0, "grad_norm": 1.1433429860697957, "learning_rate": 3.7500000000000005e-06, "loss": 0.4767, "step": 585 }, { "epoch": 5.0085470085470085, "grad_norm": 1.1929331950145705, "learning_rate": 3.746122359658834e-06, "loss": 0.4947, "step": 586 }, { "epoch": 5.017094017094017, "grad_norm": 1.053420754339069, "learning_rate": 3.7422407262554567e-06, "loss": 0.7168, "step": 587 }, { "epoch": 5.0256410256410255, "grad_norm": 1.0144269826464443, "learning_rate": 3.738355112228134e-06, "loss": 0.3795, "step": 588 }, { "epoch": 5.034188034188034, "grad_norm": 1.1279663595509448, "learning_rate": 3.7344655300278887e-06, "loss": 0.6969, "step": 589 }, { "epoch": 5.042735042735043, "grad_norm": 1.2332439824894945, "learning_rate": 3.7305719921184626e-06, "loss": 0.4497, "step": 590 }, { "epoch": 5.051282051282051, "grad_norm": 1.2073548060349433, "learning_rate": 3.7266745109762668e-06, "loss": 0.3697, "step": 591 }, { "epoch": 5.05982905982906, "grad_norm": 1.3921748253309347, "learning_rate": 3.7227730990903556e-06, "loss": 0.6594, "step": 592 }, { "epoch": 5.068376068376068, "grad_norm": 1.6701256777936668, "learning_rate": 3.718867768962371e-06, "loss": 0.5206, "step": 593 }, { "epoch": 5.076923076923077, "grad_norm": 1.1440994803121367, "learning_rate": 3.714958533106515e-06, "loss": 0.5752, "step": 594 }, { "epoch": 5.085470085470085, "grad_norm": 1.224808376797521, "learning_rate": 3.711045404049507e-06, "loss": 0.5055, "step": 595 }, { "epoch": 5.094017094017094, "grad_norm": 1.7672289430629553, "learning_rate": 3.7071283943305367e-06, "loss": 0.6325, "step": 596 }, { "epoch": 5.102564102564102, "grad_norm": 1.3100332567694668, "learning_rate": 3.7032075165012323e-06, "loss": 0.4567, "step": 597 }, { "epoch": 5.111111111111111, "grad_norm": 1.094017044371432, "learning_rate": 3.699282783125616e-06, "loss": 0.622, "step": 598 }, { "epoch": 5.119658119658119, "grad_norm": 16.34482765121646, "learning_rate": 3.6953542067800647e-06, "loss": 0.5973, "step": 599 }, { "epoch": 5.128205128205128, "grad_norm": 2.1578147738458924, "learning_rate": 3.6914218000532697e-06, "loss": 0.3968, "step": 600 }, { "epoch": 5.136752136752137, "grad_norm": 1.440300424288331, "learning_rate": 3.6874855755461975e-06, "loss": 0.403, "step": 601 }, { "epoch": 5.145299145299146, "grad_norm": 1.4169316737597903, "learning_rate": 3.683545545872045e-06, "loss": 0.4174, "step": 602 }, { "epoch": 5.153846153846154, "grad_norm": 1.3735373305878908, "learning_rate": 3.679601723656205e-06, "loss": 0.534, "step": 603 }, { "epoch": 5.162393162393163, "grad_norm": 1.0510963346680822, "learning_rate": 3.675654121536225e-06, "loss": 0.3353, "step": 604 }, { "epoch": 5.170940170940171, "grad_norm": 1.1659301532257373, "learning_rate": 3.6717027521617593e-06, "loss": 0.6423, "step": 605 }, { "epoch": 5.17948717948718, "grad_norm": 1.1710337548444323, "learning_rate": 3.667747628194539e-06, "loss": 0.4896, "step": 606 }, { "epoch": 5.188034188034188, "grad_norm": 0.981800121939619, "learning_rate": 3.6637887623083235e-06, "loss": 0.4531, "step": 607 }, { "epoch": 5.196581196581197, "grad_norm": 1.3609555391016044, "learning_rate": 3.6598261671888623e-06, "loss": 0.3553, "step": 608 }, { "epoch": 5.205128205128205, "grad_norm": 1.3543886822633837, "learning_rate": 3.655859855533859e-06, "loss": 0.4541, "step": 609 }, { "epoch": 5.213675213675214, "grad_norm": 1.249408147562903, "learning_rate": 3.651889840052922e-06, "loss": 0.3994, "step": 610 }, { "epoch": 5.222222222222222, "grad_norm": 1.1992882246674452, "learning_rate": 3.6479161334675294e-06, "loss": 0.5442, "step": 611 }, { "epoch": 5.230769230769231, "grad_norm": 1.2907969275161841, "learning_rate": 3.643938748510989e-06, "loss": 0.4032, "step": 612 }, { "epoch": 5.239316239316239, "grad_norm": 1.2081170928909912, "learning_rate": 3.6399576979283914e-06, "loss": 0.7046, "step": 613 }, { "epoch": 5.247863247863248, "grad_norm": 1.0880564796064907, "learning_rate": 3.6359729944765785e-06, "loss": 0.5074, "step": 614 }, { "epoch": 5.256410256410256, "grad_norm": 1.242466798795238, "learning_rate": 3.631984650924094e-06, "loss": 0.5935, "step": 615 }, { "epoch": 5.264957264957265, "grad_norm": 1.0887798223946241, "learning_rate": 3.6279926800511455e-06, "loss": 0.7886, "step": 616 }, { "epoch": 5.273504273504273, "grad_norm": 1.2491679856445455, "learning_rate": 3.623997094649566e-06, "loss": 0.348, "step": 617 }, { "epoch": 5.282051282051282, "grad_norm": 1.294792724963306, "learning_rate": 3.6199979075227707e-06, "loss": 0.3407, "step": 618 }, { "epoch": 5.2905982905982905, "grad_norm": 1.3554021985070248, "learning_rate": 3.6159951314857145e-06, "loss": 0.3867, "step": 619 }, { "epoch": 5.299145299145299, "grad_norm": 0.9951487626604573, "learning_rate": 3.6119887793648535e-06, "loss": 0.495, "step": 620 }, { "epoch": 5.3076923076923075, "grad_norm": 1.5132542740857915, "learning_rate": 3.607978863998104e-06, "loss": 0.3828, "step": 621 }, { "epoch": 5.316239316239316, "grad_norm": 1.6796037288363033, "learning_rate": 3.6039653982347977e-06, "loss": 0.2866, "step": 622 }, { "epoch": 5.3247863247863245, "grad_norm": 1.7369544007805016, "learning_rate": 3.5999483949356458e-06, "loss": 0.3328, "step": 623 }, { "epoch": 5.333333333333333, "grad_norm": 1.03844187756572, "learning_rate": 3.595927866972694e-06, "loss": 0.3897, "step": 624 }, { "epoch": 5.3418803418803416, "grad_norm": 1.1397342002914956, "learning_rate": 3.5919038272292824e-06, "loss": 0.5871, "step": 625 }, { "epoch": 5.35042735042735, "grad_norm": 0.9852238046959051, "learning_rate": 3.587876288600004e-06, "loss": 0.3391, "step": 626 }, { "epoch": 5.358974358974359, "grad_norm": 1.2750514024039332, "learning_rate": 3.583845263990664e-06, "loss": 0.3513, "step": 627 }, { "epoch": 5.367521367521368, "grad_norm": 1.1066020720861434, "learning_rate": 3.5798107663182386e-06, "loss": 0.4639, "step": 628 }, { "epoch": 5.3760683760683765, "grad_norm": 1.1918671558636216, "learning_rate": 3.5757728085108318e-06, "loss": 0.2819, "step": 629 }, { "epoch": 5.384615384615385, "grad_norm": 1.3152243030901882, "learning_rate": 3.5717314035076355e-06, "loss": 0.5565, "step": 630 }, { "epoch": 5.3931623931623935, "grad_norm": 1.2710610311332853, "learning_rate": 3.5676865642588894e-06, "loss": 0.3711, "step": 631 }, { "epoch": 5.401709401709402, "grad_norm": 1.3852231938798323, "learning_rate": 3.563638303725835e-06, "loss": 0.5479, "step": 632 }, { "epoch": 5.410256410256411, "grad_norm": 1.2285067983502056, "learning_rate": 3.559586634880679e-06, "loss": 0.5268, "step": 633 }, { "epoch": 5.418803418803419, "grad_norm": 1.528384561793807, "learning_rate": 3.5555315707065496e-06, "loss": 0.4178, "step": 634 }, { "epoch": 5.427350427350428, "grad_norm": 1.1960031913100837, "learning_rate": 3.551473124197454e-06, "loss": 0.4534, "step": 635 }, { "epoch": 5.435897435897436, "grad_norm": 1.248321524655849, "learning_rate": 3.5474113083582382e-06, "loss": 0.7638, "step": 636 }, { "epoch": 5.444444444444445, "grad_norm": 1.0905064404938167, "learning_rate": 3.543346136204545e-06, "loss": 0.6237, "step": 637 }, { "epoch": 5.452991452991453, "grad_norm": 1.3017143288995983, "learning_rate": 3.539277620762772e-06, "loss": 0.7304, "step": 638 }, { "epoch": 5.461538461538462, "grad_norm": 1.119653015329565, "learning_rate": 3.53520577507003e-06, "loss": 0.5459, "step": 639 }, { "epoch": 5.47008547008547, "grad_norm": 1.346437171994515, "learning_rate": 3.5311306121741017e-06, "loss": 0.417, "step": 640 }, { "epoch": 5.478632478632479, "grad_norm": 1.0289127371308224, "learning_rate": 3.5270521451333984e-06, "loss": 0.4361, "step": 641 }, { "epoch": 5.487179487179487, "grad_norm": 1.2042095998787006, "learning_rate": 3.522970387016919e-06, "loss": 0.3922, "step": 642 }, { "epoch": 5.495726495726496, "grad_norm": 1.4121964704198753, "learning_rate": 3.5188853509042105e-06, "loss": 0.4761, "step": 643 }, { "epoch": 5.504273504273504, "grad_norm": 1.2150073180857026, "learning_rate": 3.5147970498853214e-06, "loss": 0.4089, "step": 644 }, { "epoch": 5.512820512820513, "grad_norm": 1.1270316353412924, "learning_rate": 3.5107054970607624e-06, "loss": 0.3747, "step": 645 }, { "epoch": 5.521367521367521, "grad_norm": 1.2041504969653847, "learning_rate": 3.5066107055414677e-06, "loss": 0.2461, "step": 646 }, { "epoch": 5.52991452991453, "grad_norm": 1.175505164637916, "learning_rate": 3.5025126884487447e-06, "loss": 0.5827, "step": 647 }, { "epoch": 5.538461538461538, "grad_norm": 1.3339817771254319, "learning_rate": 3.4984114589142388e-06, "loss": 0.3425, "step": 648 }, { "epoch": 5.547008547008547, "grad_norm": 1.2730063860154024, "learning_rate": 3.4943070300798913e-06, "loss": 0.2885, "step": 649 }, { "epoch": 5.555555555555555, "grad_norm": 1.1269118982496695, "learning_rate": 3.4901994150978926e-06, "loss": 0.7544, "step": 650 }, { "epoch": 5.564102564102564, "grad_norm": 1.4064894774351702, "learning_rate": 3.4860886271306433e-06, "loss": 0.4694, "step": 651 }, { "epoch": 5.572649572649572, "grad_norm": 1.0680338687097581, "learning_rate": 3.481974679350712e-06, "loss": 0.3943, "step": 652 }, { "epoch": 5.581196581196581, "grad_norm": 1.2287737197668491, "learning_rate": 3.4778575849407924e-06, "loss": 0.3581, "step": 653 }, { "epoch": 5.589743589743589, "grad_norm": 1.2852278699675668, "learning_rate": 3.473737357093662e-06, "loss": 0.4511, "step": 654 }, { "epoch": 5.598290598290598, "grad_norm": 1.3283726478419335, "learning_rate": 3.4696140090121377e-06, "loss": 0.3284, "step": 655 }, { "epoch": 5.6068376068376065, "grad_norm": 1.0510813178744065, "learning_rate": 3.465487553909035e-06, "loss": 0.3599, "step": 656 }, { "epoch": 5.615384615384615, "grad_norm": 1.0755691838628418, "learning_rate": 3.461358005007128e-06, "loss": 0.577, "step": 657 }, { "epoch": 5.6239316239316235, "grad_norm": 1.064075876776, "learning_rate": 3.4572253755390996e-06, "loss": 0.6382, "step": 658 }, { "epoch": 5.632478632478632, "grad_norm": 1.1490546293761381, "learning_rate": 3.4530896787475083e-06, "loss": 0.4093, "step": 659 }, { "epoch": 5.641025641025641, "grad_norm": 1.0908195852061844, "learning_rate": 3.4489509278847415e-06, "loss": 0.3019, "step": 660 }, { "epoch": 5.64957264957265, "grad_norm": 1.0838622648728164, "learning_rate": 3.44480913621297e-06, "loss": 0.5162, "step": 661 }, { "epoch": 5.6581196581196584, "grad_norm": 1.2952654131629489, "learning_rate": 3.44066431700411e-06, "loss": 0.3326, "step": 662 }, { "epoch": 5.666666666666667, "grad_norm": 1.11428294131442, "learning_rate": 3.436516483539781e-06, "loss": 0.4055, "step": 663 }, { "epoch": 5.6752136752136755, "grad_norm": 1.2330022201567412, "learning_rate": 3.432365649111257e-06, "loss": 0.4222, "step": 664 }, { "epoch": 5.683760683760684, "grad_norm": 1.1952087265548075, "learning_rate": 3.428211827019434e-06, "loss": 0.3838, "step": 665 }, { "epoch": 5.6923076923076925, "grad_norm": 1.1168628551342878, "learning_rate": 3.4240550305747776e-06, "loss": 0.2602, "step": 666 }, { "epoch": 5.700854700854701, "grad_norm": 2.630335521757739, "learning_rate": 3.4198952730972845e-06, "loss": 0.2775, "step": 667 }, { "epoch": 5.7094017094017095, "grad_norm": 1.2094615456741151, "learning_rate": 3.4157325679164416e-06, "loss": 0.5243, "step": 668 }, { "epoch": 5.717948717948718, "grad_norm": 1.091861892576226, "learning_rate": 3.4115669283711795e-06, "loss": 0.3371, "step": 669 }, { "epoch": 5.726495726495727, "grad_norm": 1.2671530096401382, "learning_rate": 3.407398367809832e-06, "loss": 0.553, "step": 670 }, { "epoch": 5.735042735042735, "grad_norm": 1.8341290350898378, "learning_rate": 3.403226899590096e-06, "loss": 0.4215, "step": 671 }, { "epoch": 5.743589743589744, "grad_norm": 1.0560810502003946, "learning_rate": 3.3990525370789793e-06, "loss": 0.5067, "step": 672 }, { "epoch": 5.752136752136752, "grad_norm": 1.1364367551725, "learning_rate": 3.3948752936527722e-06, "loss": 0.3745, "step": 673 }, { "epoch": 5.760683760683761, "grad_norm": 1.0851772890439473, "learning_rate": 3.3906951826969905e-06, "loss": 0.4404, "step": 674 }, { "epoch": 5.769230769230769, "grad_norm": 1.253804399448194, "learning_rate": 3.386512217606339e-06, "loss": 0.5596, "step": 675 }, { "epoch": 5.777777777777778, "grad_norm": 1.0118773013123081, "learning_rate": 3.3823264117846722e-06, "loss": 0.3556, "step": 676 }, { "epoch": 5.786324786324786, "grad_norm": 1.2600330912135416, "learning_rate": 3.378137778644945e-06, "loss": 0.4202, "step": 677 }, { "epoch": 5.794871794871795, "grad_norm": 1.1341711674144634, "learning_rate": 3.3739463316091696e-06, "loss": 0.3549, "step": 678 }, { "epoch": 5.803418803418803, "grad_norm": 1.3648356770575298, "learning_rate": 3.369752084108381e-06, "loss": 0.5829, "step": 679 }, { "epoch": 5.811965811965812, "grad_norm": 1.2817095099769975, "learning_rate": 3.3655550495825824e-06, "loss": 0.4679, "step": 680 }, { "epoch": 5.82051282051282, "grad_norm": 1.3209631982240306, "learning_rate": 3.3613552414807093e-06, "loss": 0.4057, "step": 681 }, { "epoch": 5.829059829059829, "grad_norm": 1.1394673881020136, "learning_rate": 3.3571526732605875e-06, "loss": 0.3996, "step": 682 }, { "epoch": 5.837606837606837, "grad_norm": 1.0155252392209944, "learning_rate": 3.352947358388884e-06, "loss": 0.4856, "step": 683 }, { "epoch": 5.846153846153846, "grad_norm": 1.2795057777037273, "learning_rate": 3.3487393103410683e-06, "loss": 0.5898, "step": 684 }, { "epoch": 5.854700854700854, "grad_norm": 1.2951968385232915, "learning_rate": 3.3445285426013683e-06, "loss": 0.405, "step": 685 }, { "epoch": 5.863247863247864, "grad_norm": 1.4918672670392705, "learning_rate": 3.3403150686627267e-06, "loss": 0.303, "step": 686 }, { "epoch": 5.871794871794872, "grad_norm": 2.006770881098824, "learning_rate": 3.336098902026758e-06, "loss": 0.4886, "step": 687 }, { "epoch": 5.880341880341881, "grad_norm": 1.2868325384326031, "learning_rate": 3.331880056203706e-06, "loss": 0.4736, "step": 688 }, { "epoch": 5.888888888888889, "grad_norm": 1.2436589514049399, "learning_rate": 3.3276585447123957e-06, "loss": 0.5197, "step": 689 }, { "epoch": 5.897435897435898, "grad_norm": 1.352912645734446, "learning_rate": 3.3234343810801995e-06, "loss": 0.5513, "step": 690 }, { "epoch": 5.905982905982906, "grad_norm": 1.1902349811855824, "learning_rate": 3.319207578842985e-06, "loss": 0.4307, "step": 691 }, { "epoch": 5.914529914529915, "grad_norm": 1.2876545508730952, "learning_rate": 3.314978151545076e-06, "loss": 0.6827, "step": 692 }, { "epoch": 5.923076923076923, "grad_norm": 1.4876272116617923, "learning_rate": 3.3107461127392072e-06, "loss": 0.4238, "step": 693 }, { "epoch": 5.931623931623932, "grad_norm": 1.0911236336199452, "learning_rate": 3.306511475986482e-06, "loss": 0.3803, "step": 694 }, { "epoch": 5.94017094017094, "grad_norm": 1.1717280095017706, "learning_rate": 3.3022742548563293e-06, "loss": 0.6268, "step": 695 }, { "epoch": 5.948717948717949, "grad_norm": 1.5095367793755137, "learning_rate": 3.2980344629264583e-06, "loss": 0.4046, "step": 696 }, { "epoch": 5.957264957264957, "grad_norm": 1.1688187493069797, "learning_rate": 3.293792113782816e-06, "loss": 0.3746, "step": 697 }, { "epoch": 5.965811965811966, "grad_norm": 2.38178908613328, "learning_rate": 3.289547221019546e-06, "loss": 0.386, "step": 698 }, { "epoch": 5.9743589743589745, "grad_norm": 1.0387739711325623, "learning_rate": 3.285299798238938e-06, "loss": 0.5492, "step": 699 }, { "epoch": 5.982905982905983, "grad_norm": 1.1945528947453434, "learning_rate": 3.281049859051394e-06, "loss": 0.2644, "step": 700 }, { "epoch": 5.9914529914529915, "grad_norm": 1.2000576561757432, "learning_rate": 3.276797417075377e-06, "loss": 0.4911, "step": 701 }, { "epoch": 6.0, "grad_norm": 1.6335354555713457, "learning_rate": 3.272542485937369e-06, "loss": 0.493, "step": 702 }, { "epoch": 6.0085470085470085, "grad_norm": 1.8696148210468555, "learning_rate": 3.26828507927183e-06, "loss": 0.3595, "step": 703 }, { "epoch": 6.017094017094017, "grad_norm": 1.394253213921132, "learning_rate": 3.264025210721153e-06, "loss": 0.3482, "step": 704 }, { "epoch": 6.0256410256410255, "grad_norm": 1.5215937520682763, "learning_rate": 3.2597628939356174e-06, "loss": 0.5041, "step": 705 }, { "epoch": 6.034188034188034, "grad_norm": 1.2837167314940414, "learning_rate": 3.25549814257335e-06, "loss": 0.5273, "step": 706 }, { "epoch": 6.042735042735043, "grad_norm": 1.7025099621293658, "learning_rate": 3.2512309703002776e-06, "loss": 0.5485, "step": 707 }, { "epoch": 6.051282051282051, "grad_norm": 2.0471758371089313, "learning_rate": 3.2469613907900847e-06, "loss": 0.3261, "step": 708 }, { "epoch": 6.05982905982906, "grad_norm": 1.918552218538202, "learning_rate": 3.2426894177241707e-06, "loss": 0.5351, "step": 709 }, { "epoch": 6.068376068376068, "grad_norm": 1.4591389372643695, "learning_rate": 3.2384150647916033e-06, "loss": 0.3315, "step": 710 }, { "epoch": 6.076923076923077, "grad_norm": 1.2241942485088706, "learning_rate": 3.2341383456890776e-06, "loss": 0.4083, "step": 711 }, { "epoch": 6.085470085470085, "grad_norm": 1.4233353451639101, "learning_rate": 3.229859274120869e-06, "loss": 0.3002, "step": 712 }, { "epoch": 6.094017094017094, "grad_norm": 1.1255702206320295, "learning_rate": 3.2255778637987935e-06, "loss": 0.2582, "step": 713 }, { "epoch": 6.102564102564102, "grad_norm": 1.5685617433721264, "learning_rate": 3.2212941284421595e-06, "loss": 0.2923, "step": 714 }, { "epoch": 6.111111111111111, "grad_norm": 1.4408254299138377, "learning_rate": 3.217008081777726e-06, "loss": 0.2306, "step": 715 }, { "epoch": 6.119658119658119, "grad_norm": 1.3513374270055847, "learning_rate": 3.2127197375396596e-06, "loss": 0.8027, "step": 716 }, { "epoch": 6.128205128205128, "grad_norm": 1.821456048145893, "learning_rate": 3.208429109469488e-06, "loss": 0.4388, "step": 717 }, { "epoch": 6.136752136752137, "grad_norm": 1.2713819489136216, "learning_rate": 3.204136211316057e-06, "loss": 0.5603, "step": 718 }, { "epoch": 6.145299145299146, "grad_norm": 1.5416645237683366, "learning_rate": 3.199841056835489e-06, "loss": 0.4057, "step": 719 }, { "epoch": 6.153846153846154, "grad_norm": 1.4766742725804678, "learning_rate": 3.195543659791132e-06, "loss": 0.4062, "step": 720 }, { "epoch": 6.162393162393163, "grad_norm": 1.0920488604517398, "learning_rate": 3.191244033953524e-06, "loss": 0.5131, "step": 721 }, { "epoch": 6.170940170940171, "grad_norm": 2.1075139352156635, "learning_rate": 3.1869421931003446e-06, "loss": 0.2847, "step": 722 }, { "epoch": 6.17948717948718, "grad_norm": 1.4061808008069692, "learning_rate": 3.182638151016369e-06, "loss": 0.5075, "step": 723 }, { "epoch": 6.188034188034188, "grad_norm": 0.9318584748785151, "learning_rate": 3.1783319214934274e-06, "loss": 0.4692, "step": 724 }, { "epoch": 6.196581196581197, "grad_norm": 1.5607675486820585, "learning_rate": 3.17402351833036e-06, "loss": 0.4791, "step": 725 }, { "epoch": 6.205128205128205, "grad_norm": 1.8849662864698848, "learning_rate": 3.1697129553329708e-06, "loss": 0.6563, "step": 726 }, { "epoch": 6.213675213675214, "grad_norm": 1.0331208261591494, "learning_rate": 3.1654002463139854e-06, "loss": 0.3876, "step": 727 }, { "epoch": 6.222222222222222, "grad_norm": 1.3672949819681985, "learning_rate": 3.1610854050930063e-06, "loss": 0.4545, "step": 728 }, { "epoch": 6.230769230769231, "grad_norm": 1.325503991861968, "learning_rate": 3.1567684454964674e-06, "loss": 0.4732, "step": 729 }, { "epoch": 6.239316239316239, "grad_norm": 0.9754927814726547, "learning_rate": 3.1524493813575936e-06, "loss": 0.5361, "step": 730 }, { "epoch": 6.247863247863248, "grad_norm": 1.475889572340759, "learning_rate": 3.1481282265163493e-06, "loss": 0.555, "step": 731 }, { "epoch": 6.256410256410256, "grad_norm": 1.2242919034236064, "learning_rate": 3.1438049948194006e-06, "loss": 0.3423, "step": 732 }, { "epoch": 6.264957264957265, "grad_norm": 1.1461098912240082, "learning_rate": 3.1394797001200705e-06, "loss": 0.4012, "step": 733 }, { "epoch": 6.273504273504273, "grad_norm": 1.24676230120588, "learning_rate": 3.1351523562782893e-06, "loss": 0.2398, "step": 734 }, { "epoch": 6.282051282051282, "grad_norm": 1.6064295658402483, "learning_rate": 3.1308229771605546e-06, "loss": 0.3138, "step": 735 }, { "epoch": 6.2905982905982905, "grad_norm": 1.1099122618300608, "learning_rate": 3.1264915766398872e-06, "loss": 0.4298, "step": 736 }, { "epoch": 6.299145299145299, "grad_norm": 1.1345723411356023, "learning_rate": 3.1221581685957837e-06, "loss": 0.3354, "step": 737 }, { "epoch": 6.3076923076923075, "grad_norm": 1.2491753534521146, "learning_rate": 3.117822766914174e-06, "loss": 0.6514, "step": 738 }, { "epoch": 6.316239316239316, "grad_norm": 1.3935328769910025, "learning_rate": 3.1134853854873774e-06, "loss": 0.384, "step": 739 }, { "epoch": 6.3247863247863245, "grad_norm": 1.025200062176103, "learning_rate": 3.109146038214055e-06, "loss": 0.4001, "step": 740 }, { "epoch": 6.333333333333333, "grad_norm": 1.2195206502277662, "learning_rate": 3.1048047389991693e-06, "loss": 0.5226, "step": 741 }, { "epoch": 6.3418803418803416, "grad_norm": 1.32123049941495, "learning_rate": 3.1004615017539375e-06, "loss": 0.3312, "step": 742 }, { "epoch": 6.35042735042735, "grad_norm": 1.2390374240886866, "learning_rate": 3.096116340395783e-06, "loss": 0.5162, "step": 743 }, { "epoch": 6.358974358974359, "grad_norm": 1.549536241283591, "learning_rate": 3.0917692688483023e-06, "loss": 0.3452, "step": 744 }, { "epoch": 6.367521367521368, "grad_norm": 1.163597656164462, "learning_rate": 3.0874203010412057e-06, "loss": 0.4066, "step": 745 }, { "epoch": 6.3760683760683765, "grad_norm": 1.4309744794804062, "learning_rate": 3.0830694509102835e-06, "loss": 0.3004, "step": 746 }, { "epoch": 6.384615384615385, "grad_norm": 1.9244526378821927, "learning_rate": 3.0787167323973584e-06, "loss": 0.4615, "step": 747 }, { "epoch": 6.3931623931623935, "grad_norm": 1.221426024391306, "learning_rate": 3.074362159450236e-06, "loss": 0.2793, "step": 748 }, { "epoch": 6.401709401709402, "grad_norm": 1.2043561131798144, "learning_rate": 3.070005746022669e-06, "loss": 0.2147, "step": 749 }, { "epoch": 6.410256410256411, "grad_norm": 1.6838636399693165, "learning_rate": 3.0656475060743065e-06, "loss": 0.593, "step": 750 }, { "epoch": 6.418803418803419, "grad_norm": 1.2949091730132467, "learning_rate": 3.061287453570646e-06, "loss": 0.5454, "step": 751 }, { "epoch": 6.427350427350428, "grad_norm": 1.4030184121773681, "learning_rate": 3.056925602483e-06, "loss": 0.3405, "step": 752 }, { "epoch": 6.435897435897436, "grad_norm": 1.2147418551070128, "learning_rate": 3.052561966788441e-06, "loss": 0.3789, "step": 753 }, { "epoch": 6.444444444444445, "grad_norm": 1.2739180484045751, "learning_rate": 3.0481965604697582e-06, "loss": 0.3262, "step": 754 }, { "epoch": 6.452991452991453, "grad_norm": 1.5444059820139582, "learning_rate": 3.043829397515419e-06, "loss": 0.3836, "step": 755 }, { "epoch": 6.461538461538462, "grad_norm": 1.2628409498676159, "learning_rate": 3.039460491919516e-06, "loss": 0.5492, "step": 756 }, { "epoch": 6.47008547008547, "grad_norm": 1.1975440242064967, "learning_rate": 3.0350898576817268e-06, "loss": 0.4043, "step": 757 }, { "epoch": 6.478632478632479, "grad_norm": 1.2299452398293043, "learning_rate": 3.03071750880727e-06, "loss": 0.5042, "step": 758 }, { "epoch": 6.487179487179487, "grad_norm": 1.296600850446519, "learning_rate": 3.0263434593068562e-06, "loss": 0.3901, "step": 759 }, { "epoch": 6.495726495726496, "grad_norm": 1.0714540115525397, "learning_rate": 3.021967723196647e-06, "loss": 0.5952, "step": 760 }, { "epoch": 6.504273504273504, "grad_norm": 1.2212295717383705, "learning_rate": 3.017590314498208e-06, "loss": 0.4616, "step": 761 }, { "epoch": 6.512820512820513, "grad_norm": 1.8417042430435893, "learning_rate": 3.0132112472384652e-06, "loss": 0.2779, "step": 762 }, { "epoch": 6.521367521367521, "grad_norm": 1.4681440287957088, "learning_rate": 3.0088305354496574e-06, "loss": 0.4951, "step": 763 }, { "epoch": 6.52991452991453, "grad_norm": 1.3911986079523737, "learning_rate": 3.004448193169294e-06, "loss": 0.4351, "step": 764 }, { "epoch": 6.538461538461538, "grad_norm": 1.131630270165073, "learning_rate": 3.0000642344401115e-06, "loss": 0.4227, "step": 765 }, { "epoch": 6.547008547008547, "grad_norm": 1.2607186615387505, "learning_rate": 2.9956786733100225e-06, "loss": 0.4701, "step": 766 }, { "epoch": 6.555555555555555, "grad_norm": 1.0732664543097763, "learning_rate": 2.9912915238320755e-06, "loss": 0.37, "step": 767 }, { "epoch": 6.564102564102564, "grad_norm": 1.0692993622926654, "learning_rate": 2.9869028000644102e-06, "loss": 0.4654, "step": 768 }, { "epoch": 6.572649572649572, "grad_norm": 1.4021715786112823, "learning_rate": 2.9825125160702096e-06, "loss": 0.6461, "step": 769 }, { "epoch": 6.581196581196581, "grad_norm": 1.4067036745056627, "learning_rate": 2.978120685917656e-06, "loss": 0.5253, "step": 770 }, { "epoch": 6.589743589743589, "grad_norm": 1.3331749163814144, "learning_rate": 2.9737273236798868e-06, "loss": 0.2276, "step": 771 }, { "epoch": 6.598290598290598, "grad_norm": 1.0606838977666249, "learning_rate": 2.9693324434349486e-06, "loss": 0.3774, "step": 772 }, { "epoch": 6.6068376068376065, "grad_norm": 1.3127138222755697, "learning_rate": 2.9649360592657526e-06, "loss": 0.3676, "step": 773 }, { "epoch": 6.615384615384615, "grad_norm": 1.2133475948912267, "learning_rate": 2.960538185260029e-06, "loss": 0.4022, "step": 774 }, { "epoch": 6.6239316239316235, "grad_norm": 1.0322313184125005, "learning_rate": 2.956138835510282e-06, "loss": 0.4228, "step": 775 }, { "epoch": 6.632478632478632, "grad_norm": 1.2342645552359022, "learning_rate": 2.9517380241137437e-06, "loss": 0.3714, "step": 776 }, { "epoch": 6.641025641025641, "grad_norm": 1.3392632273965621, "learning_rate": 2.9473357651723324e-06, "loss": 0.3647, "step": 777 }, { "epoch": 6.64957264957265, "grad_norm": 1.4710559108999355, "learning_rate": 2.942932072792602e-06, "loss": 0.5227, "step": 778 }, { "epoch": 6.6581196581196584, "grad_norm": 1.425011112204164, "learning_rate": 2.938526961085701e-06, "loss": 0.3658, "step": 779 }, { "epoch": 6.666666666666667, "grad_norm": 1.3501743098448087, "learning_rate": 2.9341204441673267e-06, "loss": 0.3663, "step": 780 }, { "epoch": 6.6752136752136755, "grad_norm": 1.2064409829036324, "learning_rate": 2.929712536157677e-06, "loss": 0.4467, "step": 781 }, { "epoch": 6.683760683760684, "grad_norm": 1.1611425318187865, "learning_rate": 2.925303251181411e-06, "loss": 0.5549, "step": 782 }, { "epoch": 6.6923076923076925, "grad_norm": 1.6675008400391373, "learning_rate": 2.920892603367596e-06, "loss": 0.3228, "step": 783 }, { "epoch": 6.700854700854701, "grad_norm": 1.093008739070284, "learning_rate": 2.916480606849671e-06, "loss": 0.487, "step": 784 }, { "epoch": 6.7094017094017095, "grad_norm": 1.3507494915995428, "learning_rate": 2.9120672757653917e-06, "loss": 0.4002, "step": 785 }, { "epoch": 6.717948717948718, "grad_norm": 1.280608215049592, "learning_rate": 2.907652624256794e-06, "loss": 0.5651, "step": 786 }, { "epoch": 6.726495726495727, "grad_norm": 1.1277587047710411, "learning_rate": 2.903236666470143e-06, "loss": 0.5009, "step": 787 }, { "epoch": 6.735042735042735, "grad_norm": 1.4691185750909477, "learning_rate": 2.89881941655589e-06, "loss": 0.379, "step": 788 }, { "epoch": 6.743589743589744, "grad_norm": 1.3006869233227, "learning_rate": 2.8944008886686288e-06, "loss": 0.2883, "step": 789 }, { "epoch": 6.752136752136752, "grad_norm": 1.5404890921238719, "learning_rate": 2.889981096967045e-06, "loss": 0.5591, "step": 790 }, { "epoch": 6.760683760683761, "grad_norm": 1.5129641067947304, "learning_rate": 2.8855600556138757e-06, "loss": 0.4018, "step": 791 }, { "epoch": 6.769230769230769, "grad_norm": 1.7543516151257428, "learning_rate": 2.881137778775864e-06, "loss": 0.2992, "step": 792 }, { "epoch": 6.777777777777778, "grad_norm": 1.702284294452542, "learning_rate": 2.876714280623708e-06, "loss": 0.3827, "step": 793 }, { "epoch": 6.786324786324786, "grad_norm": 1.4614297879108673, "learning_rate": 2.872289575332023e-06, "loss": 0.3365, "step": 794 }, { "epoch": 6.794871794871795, "grad_norm": 1.3849915243782156, "learning_rate": 2.8678636770792907e-06, "loss": 0.4058, "step": 795 }, { "epoch": 6.803418803418803, "grad_norm": 1.26681353460394, "learning_rate": 2.863436600047815e-06, "loss": 0.3846, "step": 796 }, { "epoch": 6.811965811965812, "grad_norm": 1.6534025844078974, "learning_rate": 2.8590083584236792e-06, "loss": 0.5269, "step": 797 }, { "epoch": 6.82051282051282, "grad_norm": 1.360051665942373, "learning_rate": 2.854578966396697e-06, "loss": 0.5133, "step": 798 }, { "epoch": 6.829059829059829, "grad_norm": 1.0485488708326347, "learning_rate": 2.8501484381603685e-06, "loss": 0.2431, "step": 799 }, { "epoch": 6.837606837606837, "grad_norm": 1.4629730877013372, "learning_rate": 2.8457167879118332e-06, "loss": 0.6504, "step": 800 }, { "epoch": 6.846153846153846, "grad_norm": 1.2010217515771036, "learning_rate": 2.8412840298518295e-06, "loss": 0.4259, "step": 801 }, { "epoch": 6.854700854700854, "grad_norm": 1.5172571882549153, "learning_rate": 2.836850178184642e-06, "loss": 0.267, "step": 802 }, { "epoch": 6.863247863247864, "grad_norm": 1.274248209706548, "learning_rate": 2.8324152471180634e-06, "loss": 0.3201, "step": 803 }, { "epoch": 6.871794871794872, "grad_norm": 1.2852432803446472, "learning_rate": 2.8279792508633415e-06, "loss": 0.2461, "step": 804 }, { "epoch": 6.880341880341881, "grad_norm": 1.0075068351453549, "learning_rate": 2.8235422036351384e-06, "loss": 0.4763, "step": 805 }, { "epoch": 6.888888888888889, "grad_norm": 1.255947186502288, "learning_rate": 2.8191041196514874e-06, "loss": 0.2422, "step": 806 }, { "epoch": 6.897435897435898, "grad_norm": 1.226037306889786, "learning_rate": 2.8146650131337376e-06, "loss": 0.4444, "step": 807 }, { "epoch": 6.905982905982906, "grad_norm": 1.7899615704173477, "learning_rate": 2.81022489830652e-06, "loss": 0.4276, "step": 808 }, { "epoch": 6.914529914529915, "grad_norm": 0.9990455509140496, "learning_rate": 2.8057837893976958e-06, "loss": 0.4717, "step": 809 }, { "epoch": 6.923076923076923, "grad_norm": 1.255311068942597, "learning_rate": 2.8013417006383078e-06, "loss": 0.489, "step": 810 }, { "epoch": 6.931623931623932, "grad_norm": 0.914637503870123, "learning_rate": 2.7968986462625436e-06, "loss": 0.3367, "step": 811 }, { "epoch": 6.94017094017094, "grad_norm": 1.195207651209717, "learning_rate": 2.7924546405076837e-06, "loss": 0.4924, "step": 812 }, { "epoch": 6.948717948717949, "grad_norm": 1.2462780058107867, "learning_rate": 2.788009697614053e-06, "loss": 0.4046, "step": 813 }, { "epoch": 6.957264957264957, "grad_norm": 1.1795029432035888, "learning_rate": 2.7835638318249856e-06, "loss": 0.5524, "step": 814 }, { "epoch": 6.965811965811966, "grad_norm": 1.2531330155038323, "learning_rate": 2.7791170573867698e-06, "loss": 0.4246, "step": 815 }, { "epoch": 6.9743589743589745, "grad_norm": 1.3040915528226138, "learning_rate": 2.7746693885486044e-06, "loss": 0.4456, "step": 816 }, { "epoch": 6.982905982905983, "grad_norm": 1.293729203493905, "learning_rate": 2.770220839562556e-06, "loss": 0.4462, "step": 817 }, { "epoch": 6.9914529914529915, "grad_norm": 1.3294535147477113, "learning_rate": 2.765771424683513e-06, "loss": 0.2795, "step": 818 }, { "epoch": 7.0, "grad_norm": 1.1599921647100595, "learning_rate": 2.761321158169134e-06, "loss": 0.4591, "step": 819 }, { "epoch": 7.0085470085470085, "grad_norm": 1.584598458934145, "learning_rate": 2.7568700542798112e-06, "loss": 0.4017, "step": 820 }, { "epoch": 7.017094017094017, "grad_norm": 1.3451576551405218, "learning_rate": 2.7524181272786153e-06, "loss": 0.6676, "step": 821 }, { "epoch": 7.0256410256410255, "grad_norm": 1.6154170245528818, "learning_rate": 2.747965391431261e-06, "loss": 0.3913, "step": 822 }, { "epoch": 7.034188034188034, "grad_norm": 1.1220733748458145, "learning_rate": 2.743511861006049e-06, "loss": 0.3535, "step": 823 }, { "epoch": 7.042735042735043, "grad_norm": 3.849959694375653, "learning_rate": 2.739057550273828e-06, "loss": 0.4753, "step": 824 }, { "epoch": 7.051282051282051, "grad_norm": 3.7643260373905805, "learning_rate": 2.7346024735079483e-06, "loss": 0.2384, "step": 825 }, { "epoch": 7.05982905982906, "grad_norm": 2.169051331573644, "learning_rate": 2.7301466449842147e-06, "loss": 0.3003, "step": 826 }, { "epoch": 7.068376068376068, "grad_norm": 1.1849827445410177, "learning_rate": 2.725690078980838e-06, "loss": 0.5765, "step": 827 }, { "epoch": 7.076923076923077, "grad_norm": 1.532957836247954, "learning_rate": 2.7212327897783963e-06, "loss": 0.4367, "step": 828 }, { "epoch": 7.085470085470085, "grad_norm": 1.9990747567784164, "learning_rate": 2.7167747916597825e-06, "loss": 0.1999, "step": 829 }, { "epoch": 7.094017094017094, "grad_norm": 1.0102970931842459, "learning_rate": 2.7123160989101623e-06, "loss": 0.3689, "step": 830 }, { "epoch": 7.102564102564102, "grad_norm": 1.456532466139522, "learning_rate": 2.7078567258169264e-06, "loss": 0.472, "step": 831 }, { "epoch": 7.111111111111111, "grad_norm": 1.4365042757644924, "learning_rate": 2.703396686669646e-06, "loss": 0.2607, "step": 832 }, { "epoch": 7.119658119658119, "grad_norm": 1.5344758159392708, "learning_rate": 2.698935995760027e-06, "loss": 0.2563, "step": 833 }, { "epoch": 7.128205128205128, "grad_norm": 1.8982223511921632, "learning_rate": 2.6944746673818623e-06, "loss": 0.2973, "step": 834 }, { "epoch": 7.136752136752137, "grad_norm": 1.3744560603995541, "learning_rate": 2.6900127158309903e-06, "loss": 0.4143, "step": 835 }, { "epoch": 7.145299145299146, "grad_norm": 1.3900882827182501, "learning_rate": 2.6855501554052433e-06, "loss": 0.4789, "step": 836 }, { "epoch": 7.153846153846154, "grad_norm": 1.9535660899006906, "learning_rate": 2.6810870004044065e-06, "loss": 0.3261, "step": 837 }, { "epoch": 7.162393162393163, "grad_norm": 1.2683908981688092, "learning_rate": 2.6766232651301694e-06, "loss": 0.3066, "step": 838 }, { "epoch": 7.170940170940171, "grad_norm": 1.1018165851573016, "learning_rate": 2.672158963886082e-06, "loss": 0.4697, "step": 839 }, { "epoch": 7.17948717948718, "grad_norm": 1.7661043091369681, "learning_rate": 2.667694110977506e-06, "loss": 0.3914, "step": 840 }, { "epoch": 7.188034188034188, "grad_norm": 1.2250139257227237, "learning_rate": 2.6632287207115735e-06, "loss": 0.4166, "step": 841 }, { "epoch": 7.196581196581197, "grad_norm": 1.143439941874872, "learning_rate": 2.6587628073971366e-06, "loss": 0.5522, "step": 842 }, { "epoch": 7.205128205128205, "grad_norm": 1.2894052072252156, "learning_rate": 2.654296385344724e-06, "loss": 0.3279, "step": 843 }, { "epoch": 7.213675213675214, "grad_norm": 1.4995290175071838, "learning_rate": 2.6498294688664937e-06, "loss": 0.4711, "step": 844 }, { "epoch": 7.222222222222222, "grad_norm": 1.273027740413982, "learning_rate": 2.6453620722761897e-06, "loss": 0.4302, "step": 845 }, { "epoch": 7.230769230769231, "grad_norm": 1.4543407022876726, "learning_rate": 2.6408942098890937e-06, "loss": 0.4384, "step": 846 }, { "epoch": 7.239316239316239, "grad_norm": 1.2122402658494311, "learning_rate": 2.6364258960219794e-06, "loss": 0.5011, "step": 847 }, { "epoch": 7.247863247863248, "grad_norm": 1.2195979451169634, "learning_rate": 2.631957144993068e-06, "loss": 0.3118, "step": 848 }, { "epoch": 7.256410256410256, "grad_norm": 1.0566516660523817, "learning_rate": 2.6274879711219816e-06, "loss": 0.4623, "step": 849 }, { "epoch": 7.264957264957265, "grad_norm": 1.0439581131472695, "learning_rate": 2.6230183887296955e-06, "loss": 0.5924, "step": 850 }, { "epoch": 7.273504273504273, "grad_norm": 1.14057396639862, "learning_rate": 2.6185484121384974e-06, "loss": 0.5444, "step": 851 }, { "epoch": 7.282051282051282, "grad_norm": 1.531318860119953, "learning_rate": 2.6140780556719354e-06, "loss": 0.2808, "step": 852 }, { "epoch": 7.2905982905982905, "grad_norm": 1.230993307344185, "learning_rate": 2.6096073336547757e-06, "loss": 0.2989, "step": 853 }, { "epoch": 7.299145299145299, "grad_norm": 1.2982927110560112, "learning_rate": 2.6051362604129553e-06, "loss": 0.3645, "step": 854 }, { "epoch": 7.3076923076923075, "grad_norm": 1.0710170660050595, "learning_rate": 2.6006648502735384e-06, "loss": 0.3189, "step": 855 }, { "epoch": 7.316239316239316, "grad_norm": 1.4129830299027597, "learning_rate": 2.5961931175646658e-06, "loss": 0.2819, "step": 856 }, { "epoch": 7.3247863247863245, "grad_norm": 1.1782095507296886, "learning_rate": 2.591721076615517e-06, "loss": 0.2548, "step": 857 }, { "epoch": 7.333333333333333, "grad_norm": 1.1081164425839658, "learning_rate": 2.587248741756253e-06, "loss": 0.413, "step": 858 }, { "epoch": 7.3418803418803416, "grad_norm": 1.0167105430488317, "learning_rate": 2.5827761273179795e-06, "loss": 0.4232, "step": 859 }, { "epoch": 7.35042735042735, "grad_norm": 1.9693966463338093, "learning_rate": 2.578303247632701e-06, "loss": 0.4598, "step": 860 }, { "epoch": 7.358974358974359, "grad_norm": 1.2184335947586225, "learning_rate": 2.5738301170332665e-06, "loss": 0.5022, "step": 861 }, { "epoch": 7.367521367521368, "grad_norm": 1.6767314445095651, "learning_rate": 2.5693567498533315e-06, "loss": 0.3089, "step": 862 }, { "epoch": 7.3760683760683765, "grad_norm": 1.1887353726846537, "learning_rate": 2.5648831604273117e-06, "loss": 0.3041, "step": 863 }, { "epoch": 7.384615384615385, "grad_norm": 1.257527921008963, "learning_rate": 2.560409363090331e-06, "loss": 0.218, "step": 864 }, { "epoch": 7.3931623931623935, "grad_norm": 1.5297790295923273, "learning_rate": 2.555935372178183e-06, "loss": 0.2525, "step": 865 }, { "epoch": 7.401709401709402, "grad_norm": 1.2253355789146119, "learning_rate": 2.5514612020272792e-06, "loss": 0.2754, "step": 866 }, { "epoch": 7.410256410256411, "grad_norm": 1.8586602173239468, "learning_rate": 2.546986866974606e-06, "loss": 0.3116, "step": 867 }, { "epoch": 7.418803418803419, "grad_norm": 1.3071314590457581, "learning_rate": 2.54251238135768e-06, "loss": 0.3596, "step": 868 }, { "epoch": 7.427350427350428, "grad_norm": 1.4114379208643046, "learning_rate": 2.5380377595144984e-06, "loss": 0.5204, "step": 869 }, { "epoch": 7.435897435897436, "grad_norm": 2.1954669593462475, "learning_rate": 2.533563015783494e-06, "loss": 0.4798, "step": 870 }, { "epoch": 7.444444444444445, "grad_norm": 1.0331522720681556, "learning_rate": 2.5290881645034932e-06, "loss": 0.4036, "step": 871 }, { "epoch": 7.452991452991453, "grad_norm": 1.512218877417084, "learning_rate": 2.524613220013664e-06, "loss": 0.2333, "step": 872 }, { "epoch": 7.461538461538462, "grad_norm": 1.0955389138492935, "learning_rate": 2.5201381966534748e-06, "loss": 0.6983, "step": 873 }, { "epoch": 7.47008547008547, "grad_norm": 1.4358219923803184, "learning_rate": 2.515663108762648e-06, "loss": 0.5395, "step": 874 }, { "epoch": 7.478632478632479, "grad_norm": 1.2250651733430429, "learning_rate": 2.511187970681109e-06, "loss": 0.3976, "step": 875 }, { "epoch": 7.487179487179487, "grad_norm": 1.2100214960869564, "learning_rate": 2.5067127967489464e-06, "loss": 0.3228, "step": 876 }, { "epoch": 7.495726495726496, "grad_norm": 1.03209917062225, "learning_rate": 2.5022376013063653e-06, "loss": 0.2953, "step": 877 }, { "epoch": 7.504273504273504, "grad_norm": 2.0442691089803264, "learning_rate": 2.497762398693636e-06, "loss": 0.3491, "step": 878 }, { "epoch": 7.512820512820513, "grad_norm": 1.1238804763641137, "learning_rate": 2.493287203251054e-06, "loss": 0.5455, "step": 879 }, { "epoch": 7.521367521367521, "grad_norm": 1.2347502797408516, "learning_rate": 2.4888120293188915e-06, "loss": 0.5476, "step": 880 }, { "epoch": 7.52991452991453, "grad_norm": 1.0667993198312395, "learning_rate": 2.484336891237353e-06, "loss": 0.4568, "step": 881 }, { "epoch": 7.538461538461538, "grad_norm": 1.1705030742580442, "learning_rate": 2.4798618033465256e-06, "loss": 0.3531, "step": 882 }, { "epoch": 7.547008547008547, "grad_norm": 1.4056506930020038, "learning_rate": 2.4753867799863365e-06, "loss": 0.262, "step": 883 }, { "epoch": 7.555555555555555, "grad_norm": 1.4336180176173927, "learning_rate": 2.470911835496508e-06, "loss": 0.3073, "step": 884 }, { "epoch": 7.564102564102564, "grad_norm": 1.2364038729002997, "learning_rate": 2.466436984216507e-06, "loss": 0.3548, "step": 885 }, { "epoch": 7.572649572649572, "grad_norm": 1.9618172986680815, "learning_rate": 2.4619622404855025e-06, "loss": 0.3784, "step": 886 }, { "epoch": 7.581196581196581, "grad_norm": 0.9993817480313798, "learning_rate": 2.4574876186423203e-06, "loss": 0.4865, "step": 887 }, { "epoch": 7.589743589743589, "grad_norm": 1.1318460373221935, "learning_rate": 2.4530131330253946e-06, "loss": 0.5475, "step": 888 }, { "epoch": 7.598290598290598, "grad_norm": 2.506285524005619, "learning_rate": 2.4485387979727216e-06, "loss": 0.3429, "step": 889 }, { "epoch": 7.6068376068376065, "grad_norm": 1.8244156537083576, "learning_rate": 2.4440646278218178e-06, "loss": 0.4488, "step": 890 }, { "epoch": 7.615384615384615, "grad_norm": 1.456426214018476, "learning_rate": 2.43959063690967e-06, "loss": 0.338, "step": 891 }, { "epoch": 7.6239316239316235, "grad_norm": 1.1028797644342125, "learning_rate": 2.435116839572689e-06, "loss": 0.4484, "step": 892 }, { "epoch": 7.632478632478632, "grad_norm": 1.4439295173359044, "learning_rate": 2.430643250146669e-06, "loss": 0.6095, "step": 893 }, { "epoch": 7.641025641025641, "grad_norm": 1.2732755672552745, "learning_rate": 2.426169882966735e-06, "loss": 0.5383, "step": 894 }, { "epoch": 7.64957264957265, "grad_norm": 1.0223416245596049, "learning_rate": 2.4216967523673e-06, "loss": 0.3725, "step": 895 }, { "epoch": 7.6581196581196584, "grad_norm": 1.1127930023622536, "learning_rate": 2.4172238726820205e-06, "loss": 0.1803, "step": 896 }, { "epoch": 7.666666666666667, "grad_norm": 1.187611997597461, "learning_rate": 2.4127512582437486e-06, "loss": 0.3982, "step": 897 }, { "epoch": 7.6752136752136755, "grad_norm": 1.3600435750463735, "learning_rate": 2.4082789233844837e-06, "loss": 0.4175, "step": 898 }, { "epoch": 7.683760683760684, "grad_norm": 1.3026323364977672, "learning_rate": 2.403806882435334e-06, "loss": 0.3755, "step": 899 }, { "epoch": 7.6923076923076925, "grad_norm": 1.1445423564052852, "learning_rate": 2.399335149726463e-06, "loss": 0.416, "step": 900 }, { "epoch": 7.700854700854701, "grad_norm": 1.453300314929189, "learning_rate": 2.394863739587045e-06, "loss": 0.2368, "step": 901 }, { "epoch": 7.7094017094017095, "grad_norm": 1.406547629276672, "learning_rate": 2.3903926663452255e-06, "loss": 0.3735, "step": 902 }, { "epoch": 7.717948717948718, "grad_norm": 1.4386212720763296, "learning_rate": 2.385921944328066e-06, "loss": 0.2898, "step": 903 }, { "epoch": 7.726495726495727, "grad_norm": 1.8255812573863117, "learning_rate": 2.3814515878615035e-06, "loss": 0.4494, "step": 904 }, { "epoch": 7.735042735042735, "grad_norm": 1.4340093814213342, "learning_rate": 2.376981611270305e-06, "loss": 0.4222, "step": 905 }, { "epoch": 7.743589743589744, "grad_norm": 1.3262164726354702, "learning_rate": 2.372512028878019e-06, "loss": 0.3349, "step": 906 }, { "epoch": 7.752136752136752, "grad_norm": 1.556934377276059, "learning_rate": 2.3680428550069327e-06, "loss": 0.2797, "step": 907 }, { "epoch": 7.760683760683761, "grad_norm": 1.1894029158833457, "learning_rate": 2.3635741039780214e-06, "loss": 0.3009, "step": 908 }, { "epoch": 7.769230769230769, "grad_norm": 1.1182938806964584, "learning_rate": 2.3591057901109063e-06, "loss": 0.4986, "step": 909 }, { "epoch": 7.777777777777778, "grad_norm": 1.0919187697035915, "learning_rate": 2.3546379277238107e-06, "loss": 0.4238, "step": 910 }, { "epoch": 7.786324786324786, "grad_norm": 1.4234512798748302, "learning_rate": 2.3501705311335067e-06, "loss": 0.3576, "step": 911 }, { "epoch": 7.794871794871795, "grad_norm": 1.345019755234573, "learning_rate": 2.3457036146552766e-06, "loss": 0.275, "step": 912 }, { "epoch": 7.803418803418803, "grad_norm": 1.2026943698155572, "learning_rate": 2.341237192602864e-06, "loss": 0.4876, "step": 913 }, { "epoch": 7.811965811965812, "grad_norm": 1.425767091915281, "learning_rate": 2.336771279288427e-06, "loss": 0.2407, "step": 914 }, { "epoch": 7.82051282051282, "grad_norm": 1.8197170307382808, "learning_rate": 2.332305889022494e-06, "loss": 0.2812, "step": 915 }, { "epoch": 7.829059829059829, "grad_norm": 2.0226814905047688, "learning_rate": 2.3278410361139198e-06, "loss": 0.4093, "step": 916 }, { "epoch": 7.837606837606837, "grad_norm": 1.5227436667459866, "learning_rate": 2.3233767348698314e-06, "loss": 0.3748, "step": 917 }, { "epoch": 7.846153846153846, "grad_norm": 0.9875096286316338, "learning_rate": 2.3189129995955944e-06, "loss": 0.3997, "step": 918 }, { "epoch": 7.854700854700854, "grad_norm": 1.5962327497251578, "learning_rate": 2.314449844594758e-06, "loss": 0.4961, "step": 919 }, { "epoch": 7.863247863247864, "grad_norm": 1.182953335931338, "learning_rate": 2.3099872841690105e-06, "loss": 0.4962, "step": 920 }, { "epoch": 7.871794871794872, "grad_norm": 1.638523718350465, "learning_rate": 2.305525332618138e-06, "loss": 0.4412, "step": 921 }, { "epoch": 7.880341880341881, "grad_norm": 1.4337420318321525, "learning_rate": 2.3010640042399748e-06, "loss": 0.43, "step": 922 }, { "epoch": 7.888888888888889, "grad_norm": 1.670604048799414, "learning_rate": 2.296603313330355e-06, "loss": 0.263, "step": 923 }, { "epoch": 7.897435897435898, "grad_norm": 1.1485379775019051, "learning_rate": 2.2921432741830744e-06, "loss": 0.4643, "step": 924 }, { "epoch": 7.905982905982906, "grad_norm": 2.867980041283883, "learning_rate": 2.2876839010898377e-06, "loss": 0.2376, "step": 925 }, { "epoch": 7.914529914529915, "grad_norm": 1.168055991443902, "learning_rate": 2.283225208340218e-06, "loss": 0.3508, "step": 926 }, { "epoch": 7.923076923076923, "grad_norm": 1.3338717397882187, "learning_rate": 2.2787672102216045e-06, "loss": 0.3976, "step": 927 }, { "epoch": 7.931623931623932, "grad_norm": 1.2019336080154466, "learning_rate": 2.2743099210191623e-06, "loss": 0.6231, "step": 928 }, { "epoch": 7.94017094017094, "grad_norm": 1.6402278408284279, "learning_rate": 2.2698533550157865e-06, "loss": 0.3036, "step": 929 }, { "epoch": 7.948717948717949, "grad_norm": 1.7450291076447808, "learning_rate": 2.265397526492052e-06, "loss": 0.3265, "step": 930 }, { "epoch": 7.957264957264957, "grad_norm": 1.2116360109877196, "learning_rate": 2.2609424497261723e-06, "loss": 0.3962, "step": 931 }, { "epoch": 7.965811965811966, "grad_norm": 1.504059791655357, "learning_rate": 2.2564881389939524e-06, "loss": 0.328, "step": 932 }, { "epoch": 7.9743589743589745, "grad_norm": 1.7027076220105453, "learning_rate": 2.25203460856874e-06, "loss": 0.2808, "step": 933 }, { "epoch": 7.982905982905983, "grad_norm": 1.3865934975804255, "learning_rate": 2.2475818727213843e-06, "loss": 0.2991, "step": 934 }, { "epoch": 7.9914529914529915, "grad_norm": 1.4673027031590777, "learning_rate": 2.24312994572019e-06, "loss": 0.3486, "step": 935 }, { "epoch": 8.0, "grad_norm": 1.049108221216376, "learning_rate": 2.238678841830867e-06, "loss": 0.4932, "step": 936 }, { "epoch": 8.008547008547009, "grad_norm": 1.3551983825674456, "learning_rate": 2.2342285753164876e-06, "loss": 0.2734, "step": 937 }, { "epoch": 8.017094017094017, "grad_norm": 1.4182004282586038, "learning_rate": 2.2297791604374443e-06, "loss": 0.3156, "step": 938 }, { "epoch": 8.025641025641026, "grad_norm": 1.0864965117814793, "learning_rate": 2.2253306114513964e-06, "loss": 0.2765, "step": 939 }, { "epoch": 8.034188034188034, "grad_norm": 1.2703882083327591, "learning_rate": 2.220882942613231e-06, "loss": 0.4824, "step": 940 }, { "epoch": 8.042735042735043, "grad_norm": 1.526912834087006, "learning_rate": 2.2164361681750148e-06, "loss": 0.3501, "step": 941 }, { "epoch": 8.051282051282051, "grad_norm": 2.4233778444054255, "learning_rate": 2.2119903023859475e-06, "loss": 0.2769, "step": 942 }, { "epoch": 8.05982905982906, "grad_norm": 1.4817064190089302, "learning_rate": 2.2075453594923175e-06, "loss": 0.564, "step": 943 }, { "epoch": 8.068376068376068, "grad_norm": 1.7198698064027493, "learning_rate": 2.2031013537374564e-06, "loss": 0.5413, "step": 944 }, { "epoch": 8.076923076923077, "grad_norm": 1.4476738121666382, "learning_rate": 2.1986582993616926e-06, "loss": 0.2316, "step": 945 }, { "epoch": 8.085470085470085, "grad_norm": 1.1074691461600432, "learning_rate": 2.194216210602305e-06, "loss": 0.3108, "step": 946 }, { "epoch": 8.094017094017094, "grad_norm": 1.834202942421801, "learning_rate": 2.1897751016934802e-06, "loss": 0.3146, "step": 947 }, { "epoch": 8.102564102564102, "grad_norm": 1.2978798214303746, "learning_rate": 2.1853349868662637e-06, "loss": 0.186, "step": 948 }, { "epoch": 8.11111111111111, "grad_norm": 1.3117945103185198, "learning_rate": 2.1808958803485134e-06, "loss": 0.3242, "step": 949 }, { "epoch": 8.11965811965812, "grad_norm": 1.6686535455884859, "learning_rate": 2.1764577963648616e-06, "loss": 0.4344, "step": 950 }, { "epoch": 8.128205128205128, "grad_norm": 1.3972878519691447, "learning_rate": 2.1720207491366598e-06, "loss": 0.3, "step": 951 }, { "epoch": 8.136752136752136, "grad_norm": 1.1625677465463464, "learning_rate": 2.167584752881937e-06, "loss": 0.2146, "step": 952 }, { "epoch": 8.145299145299145, "grad_norm": 1.601301969292571, "learning_rate": 2.163149821815358e-06, "loss": 0.564, "step": 953 }, { "epoch": 8.153846153846153, "grad_norm": 1.224853456198565, "learning_rate": 2.1587159701481718e-06, "loss": 0.5251, "step": 954 }, { "epoch": 8.162393162393162, "grad_norm": 1.357992600450483, "learning_rate": 2.154283212088168e-06, "loss": 0.4969, "step": 955 }, { "epoch": 8.17094017094017, "grad_norm": 1.426670407162958, "learning_rate": 2.1498515618396327e-06, "loss": 0.2752, "step": 956 }, { "epoch": 8.179487179487179, "grad_norm": 1.1704916649264339, "learning_rate": 2.145421033603304e-06, "loss": 0.1518, "step": 957 }, { "epoch": 8.188034188034187, "grad_norm": 1.8264402097576982, "learning_rate": 2.1409916415763216e-06, "loss": 0.2544, "step": 958 }, { "epoch": 8.196581196581196, "grad_norm": 1.4744665660554996, "learning_rate": 2.1365633999521852e-06, "loss": 0.302, "step": 959 }, { "epoch": 8.205128205128204, "grad_norm": 1.1587143039082641, "learning_rate": 2.1321363229207097e-06, "loss": 0.3356, "step": 960 }, { "epoch": 8.213675213675213, "grad_norm": 1.1394161124337572, "learning_rate": 2.127710424667978e-06, "loss": 0.3758, "step": 961 }, { "epoch": 8.222222222222221, "grad_norm": 1.61420409353586, "learning_rate": 2.1232857193762923e-06, "loss": 0.237, "step": 962 }, { "epoch": 8.23076923076923, "grad_norm": 1.3412922365080107, "learning_rate": 2.1188622212241366e-06, "loss": 0.2108, "step": 963 }, { "epoch": 8.239316239316238, "grad_norm": 1.220900622715855, "learning_rate": 2.114439944386125e-06, "loss": 0.4871, "step": 964 }, { "epoch": 8.247863247863247, "grad_norm": 1.1681317060272036, "learning_rate": 2.1100189030329557e-06, "loss": 0.4729, "step": 965 }, { "epoch": 8.256410256410255, "grad_norm": 2.148611373627357, "learning_rate": 2.105599111331372e-06, "loss": 0.2583, "step": 966 }, { "epoch": 8.264957264957266, "grad_norm": 1.4116678905851439, "learning_rate": 2.101180583444111e-06, "loss": 0.3968, "step": 967 }, { "epoch": 8.273504273504274, "grad_norm": 1.1484639753245225, "learning_rate": 2.0967633335298583e-06, "loss": 0.4482, "step": 968 }, { "epoch": 8.282051282051283, "grad_norm": 1.0727263766010298, "learning_rate": 2.0923473757432073e-06, "loss": 0.2581, "step": 969 }, { "epoch": 8.290598290598291, "grad_norm": 1.2886191668311129, "learning_rate": 2.0879327242346096e-06, "loss": 0.4631, "step": 970 }, { "epoch": 8.2991452991453, "grad_norm": 1.2055035364480693, "learning_rate": 2.0835193931503297e-06, "loss": 0.3323, "step": 971 }, { "epoch": 8.307692307692308, "grad_norm": 1.7152947572112152, "learning_rate": 2.079107396632404e-06, "loss": 0.2037, "step": 972 }, { "epoch": 8.316239316239317, "grad_norm": 1.1141021797533694, "learning_rate": 2.0746967488185903e-06, "loss": 0.5559, "step": 973 }, { "epoch": 8.324786324786325, "grad_norm": 1.1459993656130663, "learning_rate": 2.0702874638423233e-06, "loss": 0.3579, "step": 974 }, { "epoch": 8.333333333333334, "grad_norm": 0.9871984116056022, "learning_rate": 2.0658795558326745e-06, "loss": 0.3504, "step": 975 }, { "epoch": 8.341880341880342, "grad_norm": 1.126408210461535, "learning_rate": 2.0614730389143004e-06, "loss": 0.2381, "step": 976 }, { "epoch": 8.350427350427351, "grad_norm": 1.5279937922363667, "learning_rate": 2.057067927207399e-06, "loss": 0.3458, "step": 977 }, { "epoch": 8.35897435897436, "grad_norm": 1.3740534569502523, "learning_rate": 2.052664234827668e-06, "loss": 0.454, "step": 978 }, { "epoch": 8.367521367521368, "grad_norm": 1.141519779977136, "learning_rate": 2.048261975886256e-06, "loss": 0.5811, "step": 979 }, { "epoch": 8.376068376068377, "grad_norm": 1.6726257371835274, "learning_rate": 2.0438611644897186e-06, "loss": 0.2511, "step": 980 }, { "epoch": 8.384615384615385, "grad_norm": 1.1118517949507838, "learning_rate": 2.0394618147399713e-06, "loss": 0.2559, "step": 981 }, { "epoch": 8.393162393162394, "grad_norm": 1.391439451118074, "learning_rate": 2.0350639407342474e-06, "loss": 0.2722, "step": 982 }, { "epoch": 8.401709401709402, "grad_norm": 1.481006180446899, "learning_rate": 2.030667556565052e-06, "loss": 0.2, "step": 983 }, { "epoch": 8.41025641025641, "grad_norm": 1.369017228519409, "learning_rate": 2.026272676320114e-06, "loss": 0.5728, "step": 984 }, { "epoch": 8.418803418803419, "grad_norm": 1.0477501912155225, "learning_rate": 2.021879314082344e-06, "loss": 0.267, "step": 985 }, { "epoch": 8.427350427350428, "grad_norm": 1.536820104698817, "learning_rate": 2.0174874839297912e-06, "loss": 0.3918, "step": 986 }, { "epoch": 8.435897435897436, "grad_norm": 1.270640833707206, "learning_rate": 2.01309719993559e-06, "loss": 0.4445, "step": 987 }, { "epoch": 8.444444444444445, "grad_norm": 1.3874319242758286, "learning_rate": 2.0087084761679245e-06, "loss": 0.364, "step": 988 }, { "epoch": 8.452991452991453, "grad_norm": 0.9453557375907675, "learning_rate": 2.0043213266899787e-06, "loss": 0.3354, "step": 989 }, { "epoch": 8.461538461538462, "grad_norm": 1.2460934799338634, "learning_rate": 1.9999357655598894e-06, "loss": 0.6679, "step": 990 }, { "epoch": 8.47008547008547, "grad_norm": 1.1321817356132304, "learning_rate": 1.995551806830706e-06, "loss": 0.3041, "step": 991 }, { "epoch": 8.478632478632479, "grad_norm": 1.535280683567388, "learning_rate": 1.9911694645503443e-06, "loss": 0.2659, "step": 992 }, { "epoch": 8.487179487179487, "grad_norm": 1.5817499556332901, "learning_rate": 1.986788752761536e-06, "loss": 0.4388, "step": 993 }, { "epoch": 8.495726495726496, "grad_norm": 1.63978371233733, "learning_rate": 1.9824096855017922e-06, "loss": 0.5096, "step": 994 }, { "epoch": 8.504273504273504, "grad_norm": 1.1861751601896062, "learning_rate": 1.978032276803354e-06, "loss": 0.4638, "step": 995 }, { "epoch": 8.512820512820513, "grad_norm": 1.2713512650546797, "learning_rate": 1.9736565406931446e-06, "loss": 0.3673, "step": 996 }, { "epoch": 8.521367521367521, "grad_norm": 1.1297283147346528, "learning_rate": 1.969282491192731e-06, "loss": 0.4807, "step": 997 }, { "epoch": 8.52991452991453, "grad_norm": 1.5692569320214733, "learning_rate": 1.9649101423182732e-06, "loss": 0.3172, "step": 998 }, { "epoch": 8.538461538461538, "grad_norm": 1.2481351983080389, "learning_rate": 1.960539508080485e-06, "loss": 0.4857, "step": 999 }, { "epoch": 8.547008547008547, "grad_norm": 1.1121259008864515, "learning_rate": 1.956170602484582e-06, "loss": 0.3605, "step": 1000 }, { "epoch": 8.555555555555555, "grad_norm": 1.0820395065466846, "learning_rate": 1.9518034395302413e-06, "loss": 0.5152, "step": 1001 }, { "epoch": 8.564102564102564, "grad_norm": 1.2170335969444013, "learning_rate": 1.94743803321156e-06, "loss": 0.1918, "step": 1002 }, { "epoch": 8.572649572649572, "grad_norm": 1.0768390804628318, "learning_rate": 1.9430743975170004e-06, "loss": 0.2212, "step": 1003 }, { "epoch": 8.581196581196581, "grad_norm": 1.1801978281526193, "learning_rate": 1.938712546429354e-06, "loss": 0.5013, "step": 1004 }, { "epoch": 8.58974358974359, "grad_norm": 1.2799936871315807, "learning_rate": 1.934352493925695e-06, "loss": 0.5044, "step": 1005 }, { "epoch": 8.598290598290598, "grad_norm": 1.1419776158740154, "learning_rate": 1.9299942539773316e-06, "loss": 0.2284, "step": 1006 }, { "epoch": 8.606837606837606, "grad_norm": 2.581226079905047, "learning_rate": 1.925637840549764e-06, "loss": 0.1372, "step": 1007 }, { "epoch": 8.615384615384615, "grad_norm": 1.4623611578834776, "learning_rate": 1.921283267602643e-06, "loss": 0.4478, "step": 1008 }, { "epoch": 8.623931623931623, "grad_norm": 2.0074150173839462, "learning_rate": 1.9169305490897173e-06, "loss": 0.3933, "step": 1009 }, { "epoch": 8.632478632478632, "grad_norm": 1.2589084565168156, "learning_rate": 1.9125796989587947e-06, "loss": 0.4317, "step": 1010 }, { "epoch": 8.64102564102564, "grad_norm": 1.1271801947834847, "learning_rate": 1.9082307311516985e-06, "loss": 0.4162, "step": 1011 }, { "epoch": 8.649572649572649, "grad_norm": 1.3583073699164292, "learning_rate": 1.9038836596042174e-06, "loss": 0.269, "step": 1012 }, { "epoch": 8.658119658119658, "grad_norm": 1.0956300687748248, "learning_rate": 1.8995384982460636e-06, "loss": 0.4542, "step": 1013 }, { "epoch": 8.666666666666666, "grad_norm": 1.411171017892716, "learning_rate": 1.895195261000831e-06, "loss": 0.2594, "step": 1014 }, { "epoch": 8.675213675213675, "grad_norm": 1.37127752697407, "learning_rate": 1.8908539617859455e-06, "loss": 0.2103, "step": 1015 }, { "epoch": 8.683760683760683, "grad_norm": 1.4011308842889736, "learning_rate": 1.8865146145126228e-06, "loss": 0.3916, "step": 1016 }, { "epoch": 8.692307692307692, "grad_norm": 1.784784106305036, "learning_rate": 1.8821772330858259e-06, "loss": 0.2464, "step": 1017 }, { "epoch": 8.7008547008547, "grad_norm": 1.4568451060436804, "learning_rate": 1.877841831404217e-06, "loss": 0.2571, "step": 1018 }, { "epoch": 8.709401709401709, "grad_norm": 1.2541641895523783, "learning_rate": 1.873508423360113e-06, "loss": 0.3484, "step": 1019 }, { "epoch": 8.717948717948717, "grad_norm": 1.790760709218908, "learning_rate": 1.8691770228394458e-06, "loss": 0.6167, "step": 1020 }, { "epoch": 8.726495726495726, "grad_norm": 1.341451765077332, "learning_rate": 1.8648476437217117e-06, "loss": 0.3222, "step": 1021 }, { "epoch": 8.735042735042736, "grad_norm": 1.0739656070864179, "learning_rate": 1.8605202998799299e-06, "loss": 0.3204, "step": 1022 }, { "epoch": 8.743589743589745, "grad_norm": 1.017114133042732, "learning_rate": 1.8561950051805994e-06, "loss": 0.4186, "step": 1023 }, { "epoch": 8.752136752136753, "grad_norm": 1.107854491656353, "learning_rate": 1.8518717734836522e-06, "loss": 0.4001, "step": 1024 }, { "epoch": 8.760683760683762, "grad_norm": 1.2546206655532879, "learning_rate": 1.8475506186424075e-06, "loss": 0.495, "step": 1025 }, { "epoch": 8.76923076923077, "grad_norm": 1.3154930449197013, "learning_rate": 1.8432315545035328e-06, "loss": 0.2821, "step": 1026 }, { "epoch": 8.777777777777779, "grad_norm": 1.4975866241382854, "learning_rate": 1.8389145949069953e-06, "loss": 0.3722, "step": 1027 }, { "epoch": 8.786324786324787, "grad_norm": 1.2831039302902338, "learning_rate": 1.8345997536860154e-06, "loss": 0.3783, "step": 1028 }, { "epoch": 8.794871794871796, "grad_norm": 1.1980975018106128, "learning_rate": 1.83028704466703e-06, "loss": 0.2603, "step": 1029 }, { "epoch": 8.803418803418804, "grad_norm": 2.209593515869201, "learning_rate": 1.8259764816696413e-06, "loss": 0.3781, "step": 1030 }, { "epoch": 8.811965811965813, "grad_norm": 1.2656408207095478, "learning_rate": 1.8216680785065734e-06, "loss": 0.4635, "step": 1031 }, { "epoch": 8.820512820512821, "grad_norm": 2.1089565512836748, "learning_rate": 1.8173618489836315e-06, "loss": 0.2241, "step": 1032 }, { "epoch": 8.82905982905983, "grad_norm": 1.2447903375211662, "learning_rate": 1.813057806899656e-06, "loss": 0.4539, "step": 1033 }, { "epoch": 8.837606837606838, "grad_norm": 1.0153965458845877, "learning_rate": 1.8087559660464766e-06, "loss": 0.4477, "step": 1034 }, { "epoch": 8.846153846153847, "grad_norm": 3.9606458120630346, "learning_rate": 1.8044563402088686e-06, "loss": 0.3163, "step": 1035 }, { "epoch": 8.854700854700855, "grad_norm": 1.7429909204407186, "learning_rate": 1.800158943164512e-06, "loss": 0.3911, "step": 1036 }, { "epoch": 8.863247863247864, "grad_norm": 1.509325256391988, "learning_rate": 1.7958637886839437e-06, "loss": 0.1908, "step": 1037 }, { "epoch": 8.871794871794872, "grad_norm": 1.3551384998854423, "learning_rate": 1.7915708905305124e-06, "loss": 0.4373, "step": 1038 }, { "epoch": 8.88034188034188, "grad_norm": 1.215439008720899, "learning_rate": 1.7872802624603408e-06, "loss": 0.4194, "step": 1039 }, { "epoch": 8.88888888888889, "grad_norm": 2.209864557912605, "learning_rate": 1.7829919182222752e-06, "loss": 0.3788, "step": 1040 }, { "epoch": 8.897435897435898, "grad_norm": 1.626332495031749, "learning_rate": 1.7787058715578415e-06, "loss": 0.4146, "step": 1041 }, { "epoch": 8.905982905982906, "grad_norm": 1.1918691292663193, "learning_rate": 1.7744221362012075e-06, "loss": 0.3042, "step": 1042 }, { "epoch": 8.914529914529915, "grad_norm": 1.2146958818698328, "learning_rate": 1.7701407258791323e-06, "loss": 0.4153, "step": 1043 }, { "epoch": 8.923076923076923, "grad_norm": 1.0513765409468407, "learning_rate": 1.7658616543109237e-06, "loss": 0.3588, "step": 1044 }, { "epoch": 8.931623931623932, "grad_norm": 1.3919842380301217, "learning_rate": 1.7615849352083975e-06, "loss": 0.4623, "step": 1045 }, { "epoch": 8.94017094017094, "grad_norm": 1.6990923788284649, "learning_rate": 1.7573105822758307e-06, "loss": 0.1482, "step": 1046 }, { "epoch": 8.948717948717949, "grad_norm": 1.5913449890668718, "learning_rate": 1.753038609209916e-06, "loss": 0.3842, "step": 1047 }, { "epoch": 8.957264957264957, "grad_norm": 1.3328131098581075, "learning_rate": 1.7487690296997234e-06, "loss": 0.3021, "step": 1048 }, { "epoch": 8.965811965811966, "grad_norm": 1.6596224898436513, "learning_rate": 1.7445018574266514e-06, "loss": 0.2747, "step": 1049 }, { "epoch": 8.974358974358974, "grad_norm": 1.4417679558155323, "learning_rate": 1.740237106064383e-06, "loss": 0.3087, "step": 1050 }, { "epoch": 8.982905982905983, "grad_norm": 1.2001346733735734, "learning_rate": 1.7359747892788476e-06, "loss": 0.4016, "step": 1051 }, { "epoch": 8.991452991452991, "grad_norm": 1.1721123567379175, "learning_rate": 1.7317149207281697e-06, "loss": 0.202, "step": 1052 }, { "epoch": 9.0, "grad_norm": 1.2206194854441548, "learning_rate": 1.7274575140626318e-06, "loss": 0.5314, "step": 1053 }, { "epoch": 9.008547008547009, "grad_norm": 1.9473535963527984, "learning_rate": 1.723202582924624e-06, "loss": 0.3228, "step": 1054 }, { "epoch": 9.017094017094017, "grad_norm": 1.288253715894709, "learning_rate": 1.7189501409486061e-06, "loss": 0.3262, "step": 1055 }, { "epoch": 9.025641025641026, "grad_norm": 1.015888824030439, "learning_rate": 1.7147002017610626e-06, "loss": 0.3223, "step": 1056 }, { "epoch": 9.034188034188034, "grad_norm": 1.1877903780566137, "learning_rate": 1.7104527789804554e-06, "loss": 0.397, "step": 1057 }, { "epoch": 9.042735042735043, "grad_norm": 1.1021032536214928, "learning_rate": 1.7062078862171838e-06, "loss": 0.2936, "step": 1058 }, { "epoch": 9.051282051282051, "grad_norm": 0.9267012407122549, "learning_rate": 1.7019655370735428e-06, "loss": 0.3594, "step": 1059 }, { "epoch": 9.05982905982906, "grad_norm": 1.42199614285682, "learning_rate": 1.6977257451436714e-06, "loss": 0.5004, "step": 1060 }, { "epoch": 9.068376068376068, "grad_norm": 1.5329574926292904, "learning_rate": 1.6934885240135179e-06, "loss": 0.2812, "step": 1061 }, { "epoch": 9.076923076923077, "grad_norm": 1.9955114217965897, "learning_rate": 1.6892538872607936e-06, "loss": 0.2585, "step": 1062 }, { "epoch": 9.085470085470085, "grad_norm": 1.2354971874595881, "learning_rate": 1.6850218484549247e-06, "loss": 0.3017, "step": 1063 }, { "epoch": 9.094017094017094, "grad_norm": 1.2906710359133946, "learning_rate": 1.6807924211570151e-06, "loss": 0.5348, "step": 1064 }, { "epoch": 9.102564102564102, "grad_norm": 1.3827715724003555, "learning_rate": 1.6765656189198013e-06, "loss": 0.3824, "step": 1065 }, { "epoch": 9.11111111111111, "grad_norm": 1.7857256227411515, "learning_rate": 1.6723414552876052e-06, "loss": 0.1725, "step": 1066 }, { "epoch": 9.11965811965812, "grad_norm": 1.4869088752229858, "learning_rate": 1.6681199437962952e-06, "loss": 0.2514, "step": 1067 }, { "epoch": 9.128205128205128, "grad_norm": 1.2481499309084931, "learning_rate": 1.663901097973243e-06, "loss": 0.5908, "step": 1068 }, { "epoch": 9.136752136752136, "grad_norm": 1.1671070517620592, "learning_rate": 1.6596849313372737e-06, "loss": 0.3567, "step": 1069 }, { "epoch": 9.145299145299145, "grad_norm": 1.3165208006094509, "learning_rate": 1.6554714573986325e-06, "loss": 0.4676, "step": 1070 }, { "epoch": 9.153846153846153, "grad_norm": 1.5518295391328731, "learning_rate": 1.6512606896589323e-06, "loss": 0.5293, "step": 1071 }, { "epoch": 9.162393162393162, "grad_norm": 1.131322170174984, "learning_rate": 1.647052641611117e-06, "loss": 0.5835, "step": 1072 }, { "epoch": 9.17094017094017, "grad_norm": 1.3950148174594257, "learning_rate": 1.6428473267394135e-06, "loss": 0.3448, "step": 1073 }, { "epoch": 9.179487179487179, "grad_norm": 1.0946896683070888, "learning_rate": 1.6386447585192911e-06, "loss": 0.3427, "step": 1074 }, { "epoch": 9.188034188034187, "grad_norm": 1.3179053377437275, "learning_rate": 1.6344449504174193e-06, "loss": 0.361, "step": 1075 }, { "epoch": 9.196581196581196, "grad_norm": 1.294239250413899, "learning_rate": 1.63024791589162e-06, "loss": 0.2773, "step": 1076 }, { "epoch": 9.205128205128204, "grad_norm": 2.464763503764214, "learning_rate": 1.6260536683908302e-06, "loss": 0.3248, "step": 1077 }, { "epoch": 9.213675213675213, "grad_norm": 1.6874344831226797, "learning_rate": 1.621862221355056e-06, "loss": 0.3882, "step": 1078 }, { "epoch": 9.222222222222221, "grad_norm": 1.4390246409930711, "learning_rate": 1.6176735882153284e-06, "loss": 0.4268, "step": 1079 }, { "epoch": 9.23076923076923, "grad_norm": 1.198697926754215, "learning_rate": 1.613487782393661e-06, "loss": 0.3663, "step": 1080 }, { "epoch": 9.239316239316238, "grad_norm": 1.8995745824605574, "learning_rate": 1.6093048173030108e-06, "loss": 0.2581, "step": 1081 }, { "epoch": 9.247863247863247, "grad_norm": 1.0568489463788437, "learning_rate": 1.6051247063472286e-06, "loss": 0.3077, "step": 1082 }, { "epoch": 9.256410256410255, "grad_norm": 1.179620148126396, "learning_rate": 1.6009474629210202e-06, "loss": 0.3039, "step": 1083 }, { "epoch": 9.264957264957266, "grad_norm": 2.3385119254410247, "learning_rate": 1.5967731004099057e-06, "loss": 0.2822, "step": 1084 }, { "epoch": 9.273504273504274, "grad_norm": 1.4098002122577362, "learning_rate": 1.5926016321901688e-06, "loss": 0.2923, "step": 1085 }, { "epoch": 9.282051282051283, "grad_norm": 1.2847130932530688, "learning_rate": 1.5884330716288215e-06, "loss": 0.3617, "step": 1086 }, { "epoch": 9.290598290598291, "grad_norm": 1.1205572597044093, "learning_rate": 1.5842674320835598e-06, "loss": 0.1306, "step": 1087 }, { "epoch": 9.2991452991453, "grad_norm": 1.067879852287505, "learning_rate": 1.5801047269027164e-06, "loss": 0.2681, "step": 1088 }, { "epoch": 9.307692307692308, "grad_norm": 1.2697754842885247, "learning_rate": 1.5759449694252226e-06, "loss": 0.3835, "step": 1089 }, { "epoch": 9.316239316239317, "grad_norm": 1.115652242318011, "learning_rate": 1.571788172980566e-06, "loss": 0.3174, "step": 1090 }, { "epoch": 9.324786324786325, "grad_norm": 1.2589435696480922, "learning_rate": 1.567634350888743e-06, "loss": 0.2131, "step": 1091 }, { "epoch": 9.333333333333334, "grad_norm": 1.1703746042658, "learning_rate": 1.56348351646022e-06, "loss": 0.2466, "step": 1092 }, { "epoch": 9.341880341880342, "grad_norm": 1.8514476531000787, "learning_rate": 1.5593356829958906e-06, "loss": 0.4028, "step": 1093 }, { "epoch": 9.350427350427351, "grad_norm": 1.830700341859361, "learning_rate": 1.5551908637870316e-06, "loss": 0.424, "step": 1094 }, { "epoch": 9.35897435897436, "grad_norm": 1.2740235245585687, "learning_rate": 1.5510490721152594e-06, "loss": 0.3676, "step": 1095 }, { "epoch": 9.367521367521368, "grad_norm": 1.8244846681767757, "learning_rate": 1.5469103212524917e-06, "loss": 0.2519, "step": 1096 }, { "epoch": 9.376068376068377, "grad_norm": 1.1653789385736504, "learning_rate": 1.5427746244609015e-06, "loss": 0.4327, "step": 1097 }, { "epoch": 9.384615384615385, "grad_norm": 1.6539684322688117, "learning_rate": 1.5386419949928732e-06, "loss": 0.4085, "step": 1098 }, { "epoch": 9.393162393162394, "grad_norm": 1.1085990250805047, "learning_rate": 1.534512446090965e-06, "loss": 0.0987, "step": 1099 }, { "epoch": 9.401709401709402, "grad_norm": 1.1240053164248822, "learning_rate": 1.5303859909878632e-06, "loss": 0.2357, "step": 1100 }, { "epoch": 9.41025641025641, "grad_norm": 1.4361555001257007, "learning_rate": 1.5262626429063385e-06, "loss": 0.2324, "step": 1101 }, { "epoch": 9.418803418803419, "grad_norm": 1.4567991642591915, "learning_rate": 1.5221424150592078e-06, "loss": 0.1904, "step": 1102 }, { "epoch": 9.427350427350428, "grad_norm": 0.9078527832107429, "learning_rate": 1.518025320649289e-06, "loss": 0.3208, "step": 1103 }, { "epoch": 9.435897435897436, "grad_norm": 0.961997474880962, "learning_rate": 1.5139113728693575e-06, "loss": 0.2273, "step": 1104 }, { "epoch": 9.444444444444445, "grad_norm": 1.2170940431358712, "learning_rate": 1.509800584902108e-06, "loss": 0.1865, "step": 1105 }, { "epoch": 9.452991452991453, "grad_norm": 1.0896252626201706, "learning_rate": 1.5056929699201095e-06, "loss": 0.3603, "step": 1106 }, { "epoch": 9.461538461538462, "grad_norm": 1.2969179051429869, "learning_rate": 1.5015885410857617e-06, "loss": 0.2665, "step": 1107 }, { "epoch": 9.47008547008547, "grad_norm": 1.3077008732044582, "learning_rate": 1.4974873115512561e-06, "loss": 0.2858, "step": 1108 }, { "epoch": 9.478632478632479, "grad_norm": 1.5312364963489444, "learning_rate": 1.4933892944585331e-06, "loss": 0.3289, "step": 1109 }, { "epoch": 9.487179487179487, "grad_norm": 1.4001962940256754, "learning_rate": 1.489294502939238e-06, "loss": 0.1762, "step": 1110 }, { "epoch": 9.495726495726496, "grad_norm": 1.6856080795603618, "learning_rate": 1.4852029501146797e-06, "loss": 0.2352, "step": 1111 }, { "epoch": 9.504273504273504, "grad_norm": 1.7737389148061027, "learning_rate": 1.4811146490957903e-06, "loss": 0.4691, "step": 1112 }, { "epoch": 9.512820512820513, "grad_norm": 1.0477925330006121, "learning_rate": 1.477029612983082e-06, "loss": 0.3916, "step": 1113 }, { "epoch": 9.521367521367521, "grad_norm": 1.1845005270835856, "learning_rate": 1.4729478548666027e-06, "loss": 0.2385, "step": 1114 }, { "epoch": 9.52991452991453, "grad_norm": 1.1819763888213934, "learning_rate": 1.468869387825899e-06, "loss": 0.5128, "step": 1115 }, { "epoch": 9.538461538461538, "grad_norm": 1.0536988938300638, "learning_rate": 1.4647942249299708e-06, "loss": 0.2344, "step": 1116 }, { "epoch": 9.547008547008547, "grad_norm": 1.9011902118681425, "learning_rate": 1.4607223792372283e-06, "loss": 0.1493, "step": 1117 }, { "epoch": 9.555555555555555, "grad_norm": 1.3143466965397048, "learning_rate": 1.4566538637954556e-06, "loss": 0.6493, "step": 1118 }, { "epoch": 9.564102564102564, "grad_norm": 1.6921258588793364, "learning_rate": 1.452588691641763e-06, "loss": 0.5337, "step": 1119 }, { "epoch": 9.572649572649572, "grad_norm": 1.1271554002907007, "learning_rate": 1.4485268758025467e-06, "loss": 0.585, "step": 1120 }, { "epoch": 9.581196581196581, "grad_norm": 1.6575284887466888, "learning_rate": 1.4444684292934508e-06, "loss": 0.3152, "step": 1121 }, { "epoch": 9.58974358974359, "grad_norm": 1.1730847533193425, "learning_rate": 1.4404133651193214e-06, "loss": 0.475, "step": 1122 }, { "epoch": 9.598290598290598, "grad_norm": 1.1143229523313662, "learning_rate": 1.436361696274166e-06, "loss": 0.3746, "step": 1123 }, { "epoch": 9.606837606837606, "grad_norm": 1.4579201342685206, "learning_rate": 1.4323134357411114e-06, "loss": 0.2438, "step": 1124 }, { "epoch": 9.615384615384615, "grad_norm": 1.224526950260683, "learning_rate": 1.4282685964923643e-06, "loss": 0.2409, "step": 1125 }, { "epoch": 9.623931623931623, "grad_norm": 1.9183943687319494, "learning_rate": 1.4242271914891688e-06, "loss": 0.2619, "step": 1126 }, { "epoch": 9.632478632478632, "grad_norm": 1.448929760659896, "learning_rate": 1.4201892336817616e-06, "loss": 0.3426, "step": 1127 }, { "epoch": 9.64102564102564, "grad_norm": 1.2013122019818248, "learning_rate": 1.4161547360093364e-06, "loss": 0.4859, "step": 1128 }, { "epoch": 9.649572649572649, "grad_norm": 1.0596559301713948, "learning_rate": 1.4121237113999975e-06, "loss": 0.3119, "step": 1129 }, { "epoch": 9.658119658119658, "grad_norm": 1.4855292910426476, "learning_rate": 1.4080961727707185e-06, "loss": 0.2595, "step": 1130 }, { "epoch": 9.666666666666666, "grad_norm": 1.2478839118441194, "learning_rate": 1.4040721330273063e-06, "loss": 0.3786, "step": 1131 }, { "epoch": 9.675213675213675, "grad_norm": 3.9119278747334416, "learning_rate": 1.4000516050643549e-06, "loss": 0.2005, "step": 1132 }, { "epoch": 9.683760683760683, "grad_norm": 1.2618989022566294, "learning_rate": 1.3960346017652027e-06, "loss": 0.2733, "step": 1133 }, { "epoch": 9.692307692307692, "grad_norm": 1.1495751469091657, "learning_rate": 1.3920211360018971e-06, "loss": 0.3475, "step": 1134 }, { "epoch": 9.7008547008547, "grad_norm": 1.154458217155745, "learning_rate": 1.3880112206351476e-06, "loss": 0.4554, "step": 1135 }, { "epoch": 9.709401709401709, "grad_norm": 1.4267903377941606, "learning_rate": 1.3840048685142863e-06, "loss": 0.2523, "step": 1136 }, { "epoch": 9.717948717948717, "grad_norm": 1.2324200322804886, "learning_rate": 1.3800020924772295e-06, "loss": 0.3087, "step": 1137 }, { "epoch": 9.726495726495726, "grad_norm": 2.119358735727275, "learning_rate": 1.3760029053504346e-06, "loss": 0.2448, "step": 1138 }, { "epoch": 9.735042735042736, "grad_norm": 1.7034025319599941, "learning_rate": 1.372007319948855e-06, "loss": 0.4434, "step": 1139 }, { "epoch": 9.743589743589745, "grad_norm": 1.4386385418392187, "learning_rate": 1.3680153490759074e-06, "loss": 0.1699, "step": 1140 }, { "epoch": 9.752136752136753, "grad_norm": 1.82898779224831, "learning_rate": 1.3640270055234227e-06, "loss": 0.3602, "step": 1141 }, { "epoch": 9.760683760683762, "grad_norm": 1.5364042446197623, "learning_rate": 1.360042302071609e-06, "loss": 0.3554, "step": 1142 }, { "epoch": 9.76923076923077, "grad_norm": 1.2751685256780956, "learning_rate": 1.356061251489012e-06, "loss": 0.5297, "step": 1143 }, { "epoch": 9.777777777777779, "grad_norm": 1.7282120128773697, "learning_rate": 1.3520838665324704e-06, "loss": 0.2374, "step": 1144 }, { "epoch": 9.786324786324787, "grad_norm": 1.3425257816234517, "learning_rate": 1.3481101599470794e-06, "loss": 0.2757, "step": 1145 }, { "epoch": 9.794871794871796, "grad_norm": 1.2699459087777707, "learning_rate": 1.3441401444661416e-06, "loss": 0.3513, "step": 1146 }, { "epoch": 9.803418803418804, "grad_norm": 1.1393292998903113, "learning_rate": 1.3401738328111374e-06, "loss": 0.2073, "step": 1147 }, { "epoch": 9.811965811965813, "grad_norm": 1.1088535968070063, "learning_rate": 1.336211237691678e-06, "loss": 0.235, "step": 1148 }, { "epoch": 9.820512820512821, "grad_norm": 1.0270552038970258, "learning_rate": 1.3322523718054615e-06, "loss": 0.3529, "step": 1149 }, { "epoch": 9.82905982905983, "grad_norm": 1.5610839594093011, "learning_rate": 1.328297247838241e-06, "loss": 0.4928, "step": 1150 }, { "epoch": 9.837606837606838, "grad_norm": 1.292327740738013, "learning_rate": 1.3243458784637763e-06, "loss": 0.4766, "step": 1151 }, { "epoch": 9.846153846153847, "grad_norm": 1.7880250892242995, "learning_rate": 1.320398276343795e-06, "loss": 0.3798, "step": 1152 }, { "epoch": 9.854700854700855, "grad_norm": 1.154221527037192, "learning_rate": 1.3164544541279555e-06, "loss": 0.4733, "step": 1153 }, { "epoch": 9.863247863247864, "grad_norm": 1.4767581712130635, "learning_rate": 1.3125144244538038e-06, "loss": 0.3272, "step": 1154 }, { "epoch": 9.871794871794872, "grad_norm": 1.1375141309454042, "learning_rate": 1.3085781999467303e-06, "loss": 0.1772, "step": 1155 }, { "epoch": 9.88034188034188, "grad_norm": 1.357707685043935, "learning_rate": 1.304645793219936e-06, "loss": 0.4535, "step": 1156 }, { "epoch": 9.88888888888889, "grad_norm": 1.8407787280927563, "learning_rate": 1.3007172168743854e-06, "loss": 0.2293, "step": 1157 }, { "epoch": 9.897435897435898, "grad_norm": 1.2944703737109389, "learning_rate": 1.2967924834987687e-06, "loss": 0.449, "step": 1158 }, { "epoch": 9.905982905982906, "grad_norm": 1.361304325226804, "learning_rate": 1.2928716056694637e-06, "loss": 0.238, "step": 1159 }, { "epoch": 9.914529914529915, "grad_norm": 1.2097243748345254, "learning_rate": 1.288954595950494e-06, "loss": 0.4441, "step": 1160 }, { "epoch": 9.923076923076923, "grad_norm": 1.4844650371837154, "learning_rate": 1.285041466893485e-06, "loss": 0.317, "step": 1161 }, { "epoch": 9.931623931623932, "grad_norm": 1.173403562434299, "learning_rate": 1.2811322310376303e-06, "loss": 0.3963, "step": 1162 }, { "epoch": 9.94017094017094, "grad_norm": 1.2779391510526092, "learning_rate": 1.2772269009096456e-06, "loss": 0.5377, "step": 1163 }, { "epoch": 9.948717948717949, "grad_norm": 1.224475576365551, "learning_rate": 1.2733254890237334e-06, "loss": 0.2121, "step": 1164 }, { "epoch": 9.957264957264957, "grad_norm": 1.4870462629125007, "learning_rate": 1.2694280078815382e-06, "loss": 0.3333, "step": 1165 }, { "epoch": 9.965811965811966, "grad_norm": 1.123284137727133, "learning_rate": 1.2655344699721111e-06, "loss": 0.1703, "step": 1166 }, { "epoch": 9.974358974358974, "grad_norm": 1.4953977023064586, "learning_rate": 1.2616448877718672e-06, "loss": 0.2374, "step": 1167 }, { "epoch": 9.982905982905983, "grad_norm": 1.322352923347006, "learning_rate": 1.257759273744545e-06, "loss": 0.497, "step": 1168 }, { "epoch": 9.991452991452991, "grad_norm": 1.3765362036635373, "learning_rate": 1.253877640341166e-06, "loss": 0.4128, "step": 1169 }, { "epoch": 10.0, "grad_norm": 1.2816541186687818, "learning_rate": 1.2500000000000007e-06, "loss": 0.1786, "step": 1170 }, { "epoch": 10.008547008547009, "grad_norm": 1.045651825933841, "learning_rate": 1.2461263651465194e-06, "loss": 0.4029, "step": 1171 }, { "epoch": 10.017094017094017, "grad_norm": 1.2489254469634197, "learning_rate": 1.2422567481933604e-06, "loss": 0.2414, "step": 1172 }, { "epoch": 10.025641025641026, "grad_norm": 1.358095126324313, "learning_rate": 1.2383911615402873e-06, "loss": 0.4616, "step": 1173 }, { "epoch": 10.034188034188034, "grad_norm": 0.941362841502233, "learning_rate": 1.2345296175741466e-06, "loss": 0.4066, "step": 1174 }, { "epoch": 10.042735042735043, "grad_norm": 1.1007821459044609, "learning_rate": 1.2306721286688312e-06, "loss": 0.1387, "step": 1175 }, { "epoch": 10.051282051282051, "grad_norm": 1.1176796389778227, "learning_rate": 1.226818707185242e-06, "loss": 0.2619, "step": 1176 }, { "epoch": 10.05982905982906, "grad_norm": 1.7579873340296106, "learning_rate": 1.2229693654712433e-06, "loss": 0.4547, "step": 1177 }, { "epoch": 10.068376068376068, "grad_norm": 1.7828786292695857, "learning_rate": 1.2191241158616284e-06, "loss": 0.2923, "step": 1178 }, { "epoch": 10.076923076923077, "grad_norm": 1.5328882990850015, "learning_rate": 1.2152829706780786e-06, "loss": 0.2028, "step": 1179 }, { "epoch": 10.085470085470085, "grad_norm": 2.4020071504961598, "learning_rate": 1.2114459422291205e-06, "loss": 0.4144, "step": 1180 }, { "epoch": 10.094017094017094, "grad_norm": 1.5205253379430628, "learning_rate": 1.2076130428100894e-06, "loss": 0.2097, "step": 1181 }, { "epoch": 10.102564102564102, "grad_norm": 1.3015048316112432, "learning_rate": 1.203784284703091e-06, "loss": 0.3175, "step": 1182 }, { "epoch": 10.11111111111111, "grad_norm": 1.1770114406108907, "learning_rate": 1.1999596801769617e-06, "loss": 0.3426, "step": 1183 }, { "epoch": 10.11965811965812, "grad_norm": 1.5961214307061742, "learning_rate": 1.196139241487225e-06, "loss": 0.1844, "step": 1184 }, { "epoch": 10.128205128205128, "grad_norm": 1.096054730084331, "learning_rate": 1.1923229808760565e-06, "loss": 0.2849, "step": 1185 }, { "epoch": 10.136752136752136, "grad_norm": 1.5756734459360875, "learning_rate": 1.1885109105722454e-06, "loss": 0.1369, "step": 1186 }, { "epoch": 10.145299145299145, "grad_norm": 1.141051374841538, "learning_rate": 1.184703042791151e-06, "loss": 0.2324, "step": 1187 }, { "epoch": 10.153846153846153, "grad_norm": 1.4524342108279535, "learning_rate": 1.1808993897346679e-06, "loss": 0.2859, "step": 1188 }, { "epoch": 10.162393162393162, "grad_norm": 1.0352571140427305, "learning_rate": 1.1770999635911857e-06, "loss": 0.2178, "step": 1189 }, { "epoch": 10.17094017094017, "grad_norm": 1.8559620220081188, "learning_rate": 1.1733047765355466e-06, "loss": 0.2326, "step": 1190 }, { "epoch": 10.179487179487179, "grad_norm": 1.6108485889953668, "learning_rate": 1.1695138407290101e-06, "loss": 0.2345, "step": 1191 }, { "epoch": 10.188034188034187, "grad_norm": 1.445694921851245, "learning_rate": 1.1657271683192156e-06, "loss": 0.5413, "step": 1192 }, { "epoch": 10.196581196581196, "grad_norm": 1.2294376484774858, "learning_rate": 1.1619447714401367e-06, "loss": 0.3702, "step": 1193 }, { "epoch": 10.205128205128204, "grad_norm": 1.2929066334757782, "learning_rate": 1.1581666622120494e-06, "loss": 0.5578, "step": 1194 }, { "epoch": 10.213675213675213, "grad_norm": 1.1179736099867603, "learning_rate": 1.154392852741491e-06, "loss": 0.5457, "step": 1195 }, { "epoch": 10.222222222222221, "grad_norm": 1.2468241550484034, "learning_rate": 1.1506233551212186e-06, "loss": 0.366, "step": 1196 }, { "epoch": 10.23076923076923, "grad_norm": 0.9296405737354689, "learning_rate": 1.1468581814301718e-06, "loss": 0.1819, "step": 1197 }, { "epoch": 10.239316239316238, "grad_norm": 1.4238388630812857, "learning_rate": 1.1430973437334375e-06, "loss": 0.4314, "step": 1198 }, { "epoch": 10.247863247863247, "grad_norm": 0.9823335539626711, "learning_rate": 1.1393408540822073e-06, "loss": 0.2993, "step": 1199 }, { "epoch": 10.256410256410255, "grad_norm": 1.6141313708327418, "learning_rate": 1.1355887245137383e-06, "loss": 0.3024, "step": 1200 }, { "epoch": 10.264957264957266, "grad_norm": 1.3995125132746562, "learning_rate": 1.1318409670513194e-06, "loss": 0.4031, "step": 1201 }, { "epoch": 10.273504273504274, "grad_norm": 1.1327941072732368, "learning_rate": 1.1280975937042263e-06, "loss": 0.2056, "step": 1202 }, { "epoch": 10.282051282051283, "grad_norm": 1.3090370221081955, "learning_rate": 1.1243586164676873e-06, "loss": 0.1986, "step": 1203 }, { "epoch": 10.290598290598291, "grad_norm": 1.19361641233488, "learning_rate": 1.1206240473228447e-06, "loss": 0.2859, "step": 1204 }, { "epoch": 10.2991452991453, "grad_norm": 1.1504227602499402, "learning_rate": 1.1168938982367162e-06, "loss": 0.3263, "step": 1205 }, { "epoch": 10.307692307692308, "grad_norm": 2.3225905553046715, "learning_rate": 1.1131681811621529e-06, "loss": 0.387, "step": 1206 }, { "epoch": 10.316239316239317, "grad_norm": 1.338744538066502, "learning_rate": 1.1094469080378076e-06, "loss": 0.1859, "step": 1207 }, { "epoch": 10.324786324786325, "grad_norm": 1.352869738451569, "learning_rate": 1.1057300907880904e-06, "loss": 0.2305, "step": 1208 }, { "epoch": 10.333333333333334, "grad_norm": 1.5052743404655706, "learning_rate": 1.1020177413231334e-06, "loss": 0.154, "step": 1209 }, { "epoch": 10.341880341880342, "grad_norm": 1.2799078598685383, "learning_rate": 1.0983098715387528e-06, "loss": 0.1347, "step": 1210 }, { "epoch": 10.350427350427351, "grad_norm": 1.4931972984521482, "learning_rate": 1.0946064933164117e-06, "loss": 0.4827, "step": 1211 }, { "epoch": 10.35897435897436, "grad_norm": 1.8359920629352564, "learning_rate": 1.0909076185231762e-06, "loss": 0.3361, "step": 1212 }, { "epoch": 10.367521367521368, "grad_norm": 0.9290479814316553, "learning_rate": 1.0872132590116866e-06, "loss": 0.2929, "step": 1213 }, { "epoch": 10.376068376068377, "grad_norm": 1.2663449027008966, "learning_rate": 1.0835234266201109e-06, "loss": 0.2174, "step": 1214 }, { "epoch": 10.384615384615385, "grad_norm": 2.0664563187789775, "learning_rate": 1.079838133172111e-06, "loss": 0.1486, "step": 1215 }, { "epoch": 10.393162393162394, "grad_norm": 1.0989776984155346, "learning_rate": 1.0761573904768054e-06, "loss": 0.1891, "step": 1216 }, { "epoch": 10.401709401709402, "grad_norm": 1.0543354481576128, "learning_rate": 1.0724812103287304e-06, "loss": 0.2999, "step": 1217 }, { "epoch": 10.41025641025641, "grad_norm": 1.0538412063008713, "learning_rate": 1.0688096045078023e-06, "loss": 0.1605, "step": 1218 }, { "epoch": 10.418803418803419, "grad_norm": 1.5967998881825511, "learning_rate": 1.0651425847792767e-06, "loss": 0.3104, "step": 1219 }, { "epoch": 10.427350427350428, "grad_norm": 1.138119287272601, "learning_rate": 1.061480162893716e-06, "loss": 0.1738, "step": 1220 }, { "epoch": 10.435897435897436, "grad_norm": 1.2348448641155363, "learning_rate": 1.0578223505869494e-06, "loss": 0.2955, "step": 1221 }, { "epoch": 10.444444444444445, "grad_norm": 2.5194715080813403, "learning_rate": 1.0541691595800338e-06, "loss": 0.485, "step": 1222 }, { "epoch": 10.452991452991453, "grad_norm": 1.0000721895885394, "learning_rate": 1.0505206015792194e-06, "loss": 0.4439, "step": 1223 }, { "epoch": 10.461538461538462, "grad_norm": 1.0958363947374086, "learning_rate": 1.0468766882759094e-06, "loss": 0.3602, "step": 1224 }, { "epoch": 10.47008547008547, "grad_norm": 1.0825014890114124, "learning_rate": 1.043237431346622e-06, "loss": 0.2051, "step": 1225 }, { "epoch": 10.478632478632479, "grad_norm": 1.4035959711749553, "learning_rate": 1.0396028424529578e-06, "loss": 0.2622, "step": 1226 }, { "epoch": 10.487179487179487, "grad_norm": 1.3152449508712105, "learning_rate": 1.0359729332415582e-06, "loss": 0.2951, "step": 1227 }, { "epoch": 10.495726495726496, "grad_norm": 1.298812306086358, "learning_rate": 1.032347715344067e-06, "loss": 0.5657, "step": 1228 }, { "epoch": 10.504273504273504, "grad_norm": 1.2384576946208885, "learning_rate": 1.0287272003770982e-06, "loss": 0.1613, "step": 1229 }, { "epoch": 10.512820512820513, "grad_norm": 1.074312238661994, "learning_rate": 1.0251113999421936e-06, "loss": 0.2647, "step": 1230 }, { "epoch": 10.521367521367521, "grad_norm": 1.595259592896357, "learning_rate": 1.0215003256257874e-06, "loss": 0.4389, "step": 1231 }, { "epoch": 10.52991452991453, "grad_norm": 1.8489731376397716, "learning_rate": 1.0178939889991717e-06, "loss": 0.2919, "step": 1232 }, { "epoch": 10.538461538461538, "grad_norm": 1.4870289686970262, "learning_rate": 1.014292401618457e-06, "loss": 0.2796, "step": 1233 }, { "epoch": 10.547008547008547, "grad_norm": 1.1764260707798178, "learning_rate": 1.0106955750245323e-06, "loss": 0.3266, "step": 1234 }, { "epoch": 10.555555555555555, "grad_norm": 1.068302905807488, "learning_rate": 1.0071035207430352e-06, "loss": 0.2251, "step": 1235 }, { "epoch": 10.564102564102564, "grad_norm": 1.6533940675412566, "learning_rate": 1.0035162502843073e-06, "loss": 0.6334, "step": 1236 }, { "epoch": 10.572649572649572, "grad_norm": 1.2461535990621175, "learning_rate": 9.999337751433643e-07, "loss": 0.3432, "step": 1237 }, { "epoch": 10.581196581196581, "grad_norm": 0.928022739303377, "learning_rate": 9.963561067998531e-07, "loss": 0.4515, "step": 1238 }, { "epoch": 10.58974358974359, "grad_norm": 0.9670006631983897, "learning_rate": 9.927832567180193e-07, "loss": 0.3008, "step": 1239 }, { "epoch": 10.598290598290598, "grad_norm": 1.3374900402459684, "learning_rate": 9.892152363466692e-07, "loss": 0.2875, "step": 1240 }, { "epoch": 10.606837606837606, "grad_norm": 1.9904354279349568, "learning_rate": 9.856520571191316e-07, "loss": 0.3453, "step": 1241 }, { "epoch": 10.615384615384615, "grad_norm": 1.1713430132865819, "learning_rate": 9.820937304532221e-07, "loss": 0.347, "step": 1242 }, { "epoch": 10.623931623931623, "grad_norm": 1.33830343768472, "learning_rate": 9.78540267751209e-07, "loss": 0.2877, "step": 1243 }, { "epoch": 10.632478632478632, "grad_norm": 1.3015970073302343, "learning_rate": 9.749916803997717e-07, "loss": 0.3081, "step": 1244 }, { "epoch": 10.64102564102564, "grad_norm": 1.1242361345286183, "learning_rate": 9.714479797699695e-07, "loss": 0.2623, "step": 1245 }, { "epoch": 10.649572649572649, "grad_norm": 1.154568783965454, "learning_rate": 9.679091772172021e-07, "loss": 0.2735, "step": 1246 }, { "epoch": 10.658119658119658, "grad_norm": 1.3468667577457005, "learning_rate": 9.643752840811734e-07, "loss": 0.4489, "step": 1247 }, { "epoch": 10.666666666666666, "grad_norm": 1.120164463622952, "learning_rate": 9.608463116858544e-07, "loss": 0.4954, "step": 1248 }, { "epoch": 10.675213675213675, "grad_norm": 1.8882904303111139, "learning_rate": 9.573222713394513e-07, "loss": 0.3261, "step": 1249 }, { "epoch": 10.683760683760683, "grad_norm": 1.10929175594392, "learning_rate": 9.538031743343628e-07, "loss": 0.5187, "step": 1250 }, { "epoch": 10.692307692307692, "grad_norm": 1.1140665621309356, "learning_rate": 9.502890319471491e-07, "loss": 0.3435, "step": 1251 }, { "epoch": 10.7008547008547, "grad_norm": 1.8794392120766952, "learning_rate": 9.467798554384946e-07, "loss": 0.2568, "step": 1252 }, { "epoch": 10.709401709401709, "grad_norm": 1.221136395755458, "learning_rate": 9.432756560531691e-07, "loss": 0.2727, "step": 1253 }, { "epoch": 10.717948717948717, "grad_norm": 1.370772824557499, "learning_rate": 9.397764450199937e-07, "loss": 0.1289, "step": 1254 }, { "epoch": 10.726495726495726, "grad_norm": 1.0055054063061366, "learning_rate": 9.362822335518062e-07, "loss": 0.441, "step": 1255 }, { "epoch": 10.735042735042736, "grad_norm": 1.134449984636529, "learning_rate": 9.327930328454249e-07, "loss": 0.2073, "step": 1256 }, { "epoch": 10.743589743589745, "grad_norm": 1.3407780383754453, "learning_rate": 9.293088540816081e-07, "loss": 0.3687, "step": 1257 }, { "epoch": 10.752136752136753, "grad_norm": 1.5464482793782732, "learning_rate": 9.258297084250256e-07, "loss": 0.3144, "step": 1258 }, { "epoch": 10.760683760683762, "grad_norm": 1.0417626287781558, "learning_rate": 9.22355607024217e-07, "loss": 0.2077, "step": 1259 }, { "epoch": 10.76923076923077, "grad_norm": 1.6811935785110077, "learning_rate": 9.188865610115572e-07, "loss": 0.2955, "step": 1260 }, { "epoch": 10.777777777777779, "grad_norm": 2.27870136281711, "learning_rate": 9.154225815032242e-07, "loss": 0.308, "step": 1261 }, { "epoch": 10.786324786324787, "grad_norm": 1.1647120152217134, "learning_rate": 9.119636795991605e-07, "loss": 0.2365, "step": 1262 }, { "epoch": 10.794871794871796, "grad_norm": 2.012077640003533, "learning_rate": 9.085098663830366e-07, "loss": 0.3534, "step": 1263 }, { "epoch": 10.803418803418804, "grad_norm": 1.017212268548294, "learning_rate": 9.050611529222167e-07, "loss": 0.2761, "step": 1264 }, { "epoch": 10.811965811965813, "grad_norm": 1.102217264281939, "learning_rate": 9.01617550267726e-07, "loss": 0.4417, "step": 1265 }, { "epoch": 10.820512820512821, "grad_norm": 1.1395609377381528, "learning_rate": 8.98179069454209e-07, "loss": 0.2634, "step": 1266 }, { "epoch": 10.82905982905983, "grad_norm": 1.1768625766970986, "learning_rate": 8.947457214999006e-07, "loss": 0.3728, "step": 1267 }, { "epoch": 10.837606837606838, "grad_norm": 0.999996881687392, "learning_rate": 8.91317517406588e-07, "loss": 0.4249, "step": 1268 }, { "epoch": 10.846153846153847, "grad_norm": 1.2213184518043707, "learning_rate": 8.878944681595742e-07, "loss": 0.2689, "step": 1269 }, { "epoch": 10.854700854700855, "grad_norm": 1.2397285549777677, "learning_rate": 8.844765847276432e-07, "loss": 0.2904, "step": 1270 }, { "epoch": 10.863247863247864, "grad_norm": 1.1401456049615335, "learning_rate": 8.810638780630279e-07, "loss": 0.229, "step": 1271 }, { "epoch": 10.871794871794872, "grad_norm": 1.7363838065776036, "learning_rate": 8.776563591013729e-07, "loss": 0.1458, "step": 1272 }, { "epoch": 10.88034188034188, "grad_norm": 1.2492147539226588, "learning_rate": 8.742540387616966e-07, "loss": 0.541, "step": 1273 }, { "epoch": 10.88888888888889, "grad_norm": 1.054703332522137, "learning_rate": 8.708569279463622e-07, "loss": 0.5736, "step": 1274 }, { "epoch": 10.897435897435898, "grad_norm": 1.152302935892294, "learning_rate": 8.674650375410379e-07, "loss": 0.2423, "step": 1275 }, { "epoch": 10.905982905982906, "grad_norm": 1.2563902276802954, "learning_rate": 8.640783784146625e-07, "loss": 0.472, "step": 1276 }, { "epoch": 10.914529914529915, "grad_norm": 2.468428865936901, "learning_rate": 8.606969614194144e-07, "loss": 0.3641, "step": 1277 }, { "epoch": 10.923076923076923, "grad_norm": 1.219505533382249, "learning_rate": 8.573207973906736e-07, "loss": 0.3872, "step": 1278 }, { "epoch": 10.931623931623932, "grad_norm": 1.269203003869562, "learning_rate": 8.539498971469848e-07, "loss": 0.2392, "step": 1279 }, { "epoch": 10.94017094017094, "grad_norm": 0.9334967508461468, "learning_rate": 8.505842714900298e-07, "loss": 0.2905, "step": 1280 }, { "epoch": 10.948717948717949, "grad_norm": 1.5491194442069778, "learning_rate": 8.472239312045851e-07, "loss": 0.4097, "step": 1281 }, { "epoch": 10.957264957264957, "grad_norm": 1.175632296882277, "learning_rate": 8.438688870584913e-07, "loss": 0.3536, "step": 1282 }, { "epoch": 10.965811965811966, "grad_norm": 1.6321024132119002, "learning_rate": 8.405191498026197e-07, "loss": 0.3949, "step": 1283 }, { "epoch": 10.974358974358974, "grad_norm": 1.5361242486173952, "learning_rate": 8.371747301708358e-07, "loss": 0.3731, "step": 1284 }, { "epoch": 10.982905982905983, "grad_norm": 1.6109773712263131, "learning_rate": 8.338356388799637e-07, "loss": 0.3013, "step": 1285 }, { "epoch": 10.991452991452991, "grad_norm": 1.3208056491021436, "learning_rate": 8.305018866297562e-07, "loss": 0.312, "step": 1286 }, { "epoch": 11.0, "grad_norm": 1.2095249712528178, "learning_rate": 8.271734841028553e-07, "loss": 0.329, "step": 1287 }, { "epoch": 11.008547008547009, "grad_norm": 1.1339926584737854, "learning_rate": 8.238504419647602e-07, "loss": 0.3453, "step": 1288 }, { "epoch": 11.017094017094017, "grad_norm": 2.285728413611872, "learning_rate": 8.205327708637958e-07, "loss": 0.2377, "step": 1289 }, { "epoch": 11.025641025641026, "grad_norm": 0.8347020886437849, "learning_rate": 8.172204814310741e-07, "loss": 0.2597, "step": 1290 }, { "epoch": 11.034188034188034, "grad_norm": 0.9894332228301046, "learning_rate": 8.139135842804638e-07, "loss": 0.2353, "step": 1291 }, { "epoch": 11.042735042735043, "grad_norm": 1.1961555891311104, "learning_rate": 8.106120900085526e-07, "loss": 0.2439, "step": 1292 }, { "epoch": 11.051282051282051, "grad_norm": 2.4837226139453503, "learning_rate": 8.073160091946156e-07, "loss": 0.2131, "step": 1293 }, { "epoch": 11.05982905982906, "grad_norm": 1.2402400103690412, "learning_rate": 8.040253524005834e-07, "loss": 0.3762, "step": 1294 }, { "epoch": 11.068376068376068, "grad_norm": 1.0670179465841032, "learning_rate": 8.007401301710022e-07, "loss": 0.3589, "step": 1295 }, { "epoch": 11.076923076923077, "grad_norm": 1.2043916022680978, "learning_rate": 7.974603530330069e-07, "loss": 0.2431, "step": 1296 }, { "epoch": 11.085470085470085, "grad_norm": 1.0286938560514645, "learning_rate": 7.941860314962843e-07, "loss": 0.0952, "step": 1297 }, { "epoch": 11.094017094017094, "grad_norm": 1.0664360647879936, "learning_rate": 7.909171760530351e-07, "loss": 0.2457, "step": 1298 }, { "epoch": 11.102564102564102, "grad_norm": 3.129024390617159, "learning_rate": 7.876537971779493e-07, "loss": 0.1881, "step": 1299 }, { "epoch": 11.11111111111111, "grad_norm": 1.0282067050352575, "learning_rate": 7.843959053281663e-07, "loss": 0.4266, "step": 1300 }, { "epoch": 11.11965811965812, "grad_norm": 1.1975968400831338, "learning_rate": 7.811435109432417e-07, "loss": 0.4366, "step": 1301 }, { "epoch": 11.128205128205128, "grad_norm": 1.0744374162194799, "learning_rate": 7.778966244451169e-07, "loss": 0.2198, "step": 1302 }, { "epoch": 11.136752136752136, "grad_norm": 1.1029563296447737, "learning_rate": 7.746552562380829e-07, "loss": 0.3467, "step": 1303 }, { "epoch": 11.145299145299145, "grad_norm": 1.1960864725036724, "learning_rate": 7.714194167087466e-07, "loss": 0.3032, "step": 1304 }, { "epoch": 11.153846153846153, "grad_norm": 1.0760347397890864, "learning_rate": 7.681891162260016e-07, "loss": 0.1681, "step": 1305 }, { "epoch": 11.162393162393162, "grad_norm": 0.9750236130397525, "learning_rate": 7.649643651409916e-07, "loss": 0.3941, "step": 1306 }, { "epoch": 11.17094017094017, "grad_norm": 1.1613839311484029, "learning_rate": 7.617451737870754e-07, "loss": 0.4137, "step": 1307 }, { "epoch": 11.179487179487179, "grad_norm": 1.062582908339486, "learning_rate": 7.585315524797998e-07, "loss": 0.28, "step": 1308 }, { "epoch": 11.188034188034187, "grad_norm": 0.8296624704516735, "learning_rate": 7.553235115168598e-07, "loss": 0.1661, "step": 1309 }, { "epoch": 11.196581196581196, "grad_norm": 1.0142078743901737, "learning_rate": 7.521210611780715e-07, "loss": 0.1358, "step": 1310 }, { "epoch": 11.205128205128204, "grad_norm": 1.0029814830645243, "learning_rate": 7.489242117253342e-07, "loss": 0.2322, "step": 1311 }, { "epoch": 11.213675213675213, "grad_norm": 1.4087404893921196, "learning_rate": 7.457329734026012e-07, "loss": 0.2598, "step": 1312 }, { "epoch": 11.222222222222221, "grad_norm": 1.2902537318460452, "learning_rate": 7.425473564358457e-07, "loss": 0.2689, "step": 1313 }, { "epoch": 11.23076923076923, "grad_norm": 1.3368847754229851, "learning_rate": 7.393673710330271e-07, "loss": 0.1781, "step": 1314 }, { "epoch": 11.239316239316238, "grad_norm": 1.2385657214763104, "learning_rate": 7.361930273840581e-07, "loss": 0.6036, "step": 1315 }, { "epoch": 11.247863247863247, "grad_norm": 1.1567242628436323, "learning_rate": 7.330243356607758e-07, "loss": 0.4502, "step": 1316 }, { "epoch": 11.256410256410255, "grad_norm": 1.0159771072872956, "learning_rate": 7.298613060169035e-07, "loss": 0.3977, "step": 1317 }, { "epoch": 11.264957264957266, "grad_norm": 1.3932241780307368, "learning_rate": 7.267039485880225e-07, "loss": 0.3772, "step": 1318 }, { "epoch": 11.273504273504274, "grad_norm": 2.6569957477431574, "learning_rate": 7.235522734915393e-07, "loss": 0.1337, "step": 1319 }, { "epoch": 11.282051282051283, "grad_norm": 1.0130803300138838, "learning_rate": 7.204062908266491e-07, "loss": 0.2043, "step": 1320 }, { "epoch": 11.290598290598291, "grad_norm": 1.2815538738053691, "learning_rate": 7.172660106743073e-07, "loss": 0.5477, "step": 1321 }, { "epoch": 11.2991452991453, "grad_norm": 1.1054111222974445, "learning_rate": 7.14131443097198e-07, "loss": 0.3482, "step": 1322 }, { "epoch": 11.307692307692308, "grad_norm": 0.9503250393747996, "learning_rate": 7.110025981396976e-07, "loss": 0.1972, "step": 1323 }, { "epoch": 11.316239316239317, "grad_norm": 1.3510519482608403, "learning_rate": 7.078794858278462e-07, "loss": 0.2791, "step": 1324 }, { "epoch": 11.324786324786325, "grad_norm": 1.1784971548342345, "learning_rate": 7.047621161693152e-07, "loss": 0.3825, "step": 1325 }, { "epoch": 11.333333333333334, "grad_norm": 1.0519903179740981, "learning_rate": 7.016504991533727e-07, "loss": 0.1166, "step": 1326 }, { "epoch": 11.341880341880342, "grad_norm": 1.230631119598931, "learning_rate": 6.985446447508526e-07, "loss": 0.3721, "step": 1327 }, { "epoch": 11.350427350427351, "grad_norm": 1.5015192550894227, "learning_rate": 6.954445629141246e-07, "loss": 0.4144, "step": 1328 }, { "epoch": 11.35897435897436, "grad_norm": 1.0329732984020763, "learning_rate": 6.923502635770618e-07, "loss": 0.3516, "step": 1329 }, { "epoch": 11.367521367521368, "grad_norm": 1.0689751035375088, "learning_rate": 6.892617566550044e-07, "loss": 0.3322, "step": 1330 }, { "epoch": 11.376068376068377, "grad_norm": 1.0234832818727786, "learning_rate": 6.861790520447356e-07, "loss": 0.1614, "step": 1331 }, { "epoch": 11.384615384615385, "grad_norm": 1.3705378171282174, "learning_rate": 6.831021596244425e-07, "loss": 0.1148, "step": 1332 }, { "epoch": 11.393162393162394, "grad_norm": 2.687284547442591, "learning_rate": 6.800310892536884e-07, "loss": 0.3965, "step": 1333 }, { "epoch": 11.401709401709402, "grad_norm": 1.122865008233458, "learning_rate": 6.769658507733815e-07, "loss": 0.2215, "step": 1334 }, { "epoch": 11.41025641025641, "grad_norm": 1.4255571677021555, "learning_rate": 6.739064540057425e-07, "loss": 0.3261, "step": 1335 }, { "epoch": 11.418803418803419, "grad_norm": 1.000000239576537, "learning_rate": 6.708529087542717e-07, "loss": 0.3153, "step": 1336 }, { "epoch": 11.427350427350428, "grad_norm": 1.6496826359295964, "learning_rate": 6.678052248037184e-07, "loss": 0.3163, "step": 1337 }, { "epoch": 11.435897435897436, "grad_norm": 1.2866215785516808, "learning_rate": 6.64763411920053e-07, "loss": 0.3916, "step": 1338 }, { "epoch": 11.444444444444445, "grad_norm": 1.0576686463394254, "learning_rate": 6.617274798504286e-07, "loss": 0.1983, "step": 1339 }, { "epoch": 11.452991452991453, "grad_norm": 1.1094778246746804, "learning_rate": 6.586974383231573e-07, "loss": 0.1522, "step": 1340 }, { "epoch": 11.461538461538462, "grad_norm": 1.0465398985429073, "learning_rate": 6.556732970476748e-07, "loss": 0.3405, "step": 1341 }, { "epoch": 11.47008547008547, "grad_norm": 1.0863827719627053, "learning_rate": 6.526550657145089e-07, "loss": 0.1914, "step": 1342 }, { "epoch": 11.478632478632479, "grad_norm": 1.2352159374155984, "learning_rate": 6.496427539952499e-07, "loss": 0.2707, "step": 1343 }, { "epoch": 11.487179487179487, "grad_norm": 1.1636422042646013, "learning_rate": 6.4663637154252e-07, "loss": 0.3073, "step": 1344 }, { "epoch": 11.495726495726496, "grad_norm": 0.9400787003787303, "learning_rate": 6.436359279899426e-07, "loss": 0.2746, "step": 1345 }, { "epoch": 11.504273504273504, "grad_norm": 1.7599302281828313, "learning_rate": 6.406414329521079e-07, "loss": 0.3627, "step": 1346 }, { "epoch": 11.512820512820513, "grad_norm": 1.0452282974681053, "learning_rate": 6.376528960245476e-07, "loss": 0.3472, "step": 1347 }, { "epoch": 11.521367521367521, "grad_norm": 1.1671960949053146, "learning_rate": 6.346703267836998e-07, "loss": 0.4321, "step": 1348 }, { "epoch": 11.52991452991453, "grad_norm": 1.0530089555475837, "learning_rate": 6.316937347868787e-07, "loss": 0.1937, "step": 1349 }, { "epoch": 11.538461538461538, "grad_norm": 1.519317404025633, "learning_rate": 6.28723129572247e-07, "loss": 0.4331, "step": 1350 }, { "epoch": 11.547008547008547, "grad_norm": 1.0524691505080326, "learning_rate": 6.257585206587843e-07, "loss": 0.2391, "step": 1351 }, { "epoch": 11.555555555555555, "grad_norm": 2.2094469058936594, "learning_rate": 6.227999175462521e-07, "loss": 0.5341, "step": 1352 }, { "epoch": 11.564102564102564, "grad_norm": 1.0170031770981227, "learning_rate": 6.198473297151705e-07, "loss": 0.2465, "step": 1353 }, { "epoch": 11.572649572649572, "grad_norm": 1.2489717726242522, "learning_rate": 6.169007666267824e-07, "loss": 0.4308, "step": 1354 }, { "epoch": 11.581196581196581, "grad_norm": 1.4302549005618361, "learning_rate": 6.139602377230247e-07, "loss": 0.3328, "step": 1355 }, { "epoch": 11.58974358974359, "grad_norm": 1.0017349912737998, "learning_rate": 6.110257524264998e-07, "loss": 0.1254, "step": 1356 }, { "epoch": 11.598290598290598, "grad_norm": 1.0916907629282087, "learning_rate": 6.080973201404444e-07, "loss": 0.5605, "step": 1357 }, { "epoch": 11.606837606837606, "grad_norm": 1.0348888975048909, "learning_rate": 6.051749502486967e-07, "loss": 0.2208, "step": 1358 }, { "epoch": 11.615384615384615, "grad_norm": 1.3165290208428866, "learning_rate": 6.022586521156714e-07, "loss": 0.2646, "step": 1359 }, { "epoch": 11.623931623931623, "grad_norm": 1.2837933636074768, "learning_rate": 5.993484350863246e-07, "loss": 0.164, "step": 1360 }, { "epoch": 11.632478632478632, "grad_norm": 2.412451254753726, "learning_rate": 5.964443084861265e-07, "loss": 0.0772, "step": 1361 }, { "epoch": 11.64102564102564, "grad_norm": 1.7479905420082966, "learning_rate": 5.935462816210325e-07, "loss": 0.3332, "step": 1362 }, { "epoch": 11.649572649572649, "grad_norm": 0.9679775296832205, "learning_rate": 5.906543637774512e-07, "loss": 0.3549, "step": 1363 }, { "epoch": 11.658119658119658, "grad_norm": 1.3334212644868122, "learning_rate": 5.877685642222163e-07, "loss": 0.4342, "step": 1364 }, { "epoch": 11.666666666666666, "grad_norm": 1.6850702917590108, "learning_rate": 5.848888922025553e-07, "loss": 0.3346, "step": 1365 }, { "epoch": 11.675213675213675, "grad_norm": 1.1306411787231243, "learning_rate": 5.820153569460596e-07, "loss": 0.1642, "step": 1366 }, { "epoch": 11.683760683760683, "grad_norm": 1.8273706494458912, "learning_rate": 5.791479676606587e-07, "loss": 0.2132, "step": 1367 }, { "epoch": 11.692307692307692, "grad_norm": 1.2185234414343529, "learning_rate": 5.762867335345851e-07, "loss": 0.454, "step": 1368 }, { "epoch": 11.7008547008547, "grad_norm": 1.315369472839878, "learning_rate": 5.734316637363505e-07, "loss": 0.3286, "step": 1369 }, { "epoch": 11.709401709401709, "grad_norm": 1.2250823833793434, "learning_rate": 5.705827674147125e-07, "loss": 0.2436, "step": 1370 }, { "epoch": 11.717948717948717, "grad_norm": 1.1492279019896923, "learning_rate": 5.67740053698646e-07, "loss": 0.4002, "step": 1371 }, { "epoch": 11.726495726495726, "grad_norm": 1.35392039503661, "learning_rate": 5.649035316973142e-07, "loss": 0.4827, "step": 1372 }, { "epoch": 11.735042735042736, "grad_norm": 1.0492032813030379, "learning_rate": 5.620732105000415e-07, "loss": 0.1662, "step": 1373 }, { "epoch": 11.743589743589745, "grad_norm": 1.2987614026152967, "learning_rate": 5.5924909917628e-07, "loss": 0.3225, "step": 1374 }, { "epoch": 11.752136752136753, "grad_norm": 1.1037691937130272, "learning_rate": 5.564312067755856e-07, "loss": 0.2383, "step": 1375 }, { "epoch": 11.760683760683762, "grad_norm": 1.4303796759619123, "learning_rate": 5.536195423275839e-07, "loss": 0.3351, "step": 1376 }, { "epoch": 11.76923076923077, "grad_norm": 1.0452517599489266, "learning_rate": 5.508141148419443e-07, "loss": 0.1719, "step": 1377 }, { "epoch": 11.777777777777779, "grad_norm": 1.2734071534409364, "learning_rate": 5.48014933308352e-07, "loss": 0.3308, "step": 1378 }, { "epoch": 11.786324786324787, "grad_norm": 1.3858295600616295, "learning_rate": 5.45222006696477e-07, "loss": 0.3838, "step": 1379 }, { "epoch": 11.794871794871796, "grad_norm": 1.0090102173530728, "learning_rate": 5.424353439559446e-07, "loss": 0.3294, "step": 1380 }, { "epoch": 11.803418803418804, "grad_norm": 1.5702211951848428, "learning_rate": 5.396549540163106e-07, "loss": 0.3281, "step": 1381 }, { "epoch": 11.811965811965813, "grad_norm": 2.0057111464447672, "learning_rate": 5.36880845787028e-07, "loss": 0.2921, "step": 1382 }, { "epoch": 11.820512820512821, "grad_norm": 1.21502886222698, "learning_rate": 5.341130281574233e-07, "loss": 0.4285, "step": 1383 }, { "epoch": 11.82905982905983, "grad_norm": 1.1066519508861632, "learning_rate": 5.313515099966627e-07, "loss": 0.3674, "step": 1384 }, { "epoch": 11.837606837606838, "grad_norm": 1.4189752940939921, "learning_rate": 5.28596300153728e-07, "loss": 0.3384, "step": 1385 }, { "epoch": 11.846153846153847, "grad_norm": 1.0497925251189915, "learning_rate": 5.258474074573878e-07, "loss": 0.3128, "step": 1386 }, { "epoch": 11.854700854700855, "grad_norm": 1.5489841745380557, "learning_rate": 5.231048407161657e-07, "loss": 0.4025, "step": 1387 }, { "epoch": 11.863247863247864, "grad_norm": 0.9941263366209754, "learning_rate": 5.203686087183149e-07, "loss": 0.2513, "step": 1388 }, { "epoch": 11.871794871794872, "grad_norm": 1.4119603032969352, "learning_rate": 5.176387202317915e-07, "loss": 0.4545, "step": 1389 }, { "epoch": 11.88034188034188, "grad_norm": 1.0132661355105295, "learning_rate": 5.149151840042224e-07, "loss": 0.4799, "step": 1390 }, { "epoch": 11.88888888888889, "grad_norm": 1.1066098785113443, "learning_rate": 5.121980087628802e-07, "loss": 0.2781, "step": 1391 }, { "epoch": 11.897435897435898, "grad_norm": 0.9564853712646907, "learning_rate": 5.094872032146562e-07, "loss": 0.2008, "step": 1392 }, { "epoch": 11.905982905982906, "grad_norm": 1.4412517815970023, "learning_rate": 5.06782776046027e-07, "loss": 0.3013, "step": 1393 }, { "epoch": 11.914529914529915, "grad_norm": 1.0156654862669703, "learning_rate": 5.040847359230327e-07, "loss": 0.2307, "step": 1394 }, { "epoch": 11.923076923076923, "grad_norm": 1.0769449871390542, "learning_rate": 5.013930914912477e-07, "loss": 0.3282, "step": 1395 }, { "epoch": 11.931623931623932, "grad_norm": 1.096688158237559, "learning_rate": 4.98707851375749e-07, "loss": 0.2744, "step": 1396 }, { "epoch": 11.94017094017094, "grad_norm": 1.4568030911492071, "learning_rate": 4.96029024181095e-07, "loss": 0.3547, "step": 1397 }, { "epoch": 11.948717948717949, "grad_norm": 1.101802823212953, "learning_rate": 4.933566184912931e-07, "loss": 0.2269, "step": 1398 }, { "epoch": 11.957264957264957, "grad_norm": 1.3169879884728675, "learning_rate": 4.906906428697736e-07, "loss": 0.3043, "step": 1399 }, { "epoch": 11.965811965811966, "grad_norm": 1.5033493354032195, "learning_rate": 4.880311058593617e-07, "loss": 0.5169, "step": 1400 }, { "epoch": 11.974358974358974, "grad_norm": 1.8372923383003767, "learning_rate": 4.853780159822521e-07, "loss": 0.4537, "step": 1401 }, { "epoch": 11.982905982905983, "grad_norm": 1.1792625592862642, "learning_rate": 4.827313817399809e-07, "loss": 0.2065, "step": 1402 }, { "epoch": 11.991452991452991, "grad_norm": 1.126875070334918, "learning_rate": 4.800912116133955e-07, "loss": 0.2456, "step": 1403 }, { "epoch": 12.0, "grad_norm": 1.5697513878589628, "learning_rate": 4.774575140626317e-07, "loss": 0.4124, "step": 1404 }, { "epoch": 12.008547008547009, "grad_norm": 0.8438163695497493, "learning_rate": 4.748302975270838e-07, "loss": 0.3199, "step": 1405 }, { "epoch": 12.017094017094017, "grad_norm": 1.5079877456318747, "learning_rate": 4.7220957042537793e-07, "loss": 0.4273, "step": 1406 }, { "epoch": 12.025641025641026, "grad_norm": 1.0229727290063586, "learning_rate": 4.6959534115534665e-07, "loss": 0.1734, "step": 1407 }, { "epoch": 12.034188034188034, "grad_norm": 0.97072370113068, "learning_rate": 4.669876180940014e-07, "loss": 0.2477, "step": 1408 }, { "epoch": 12.042735042735043, "grad_norm": 1.211064910341796, "learning_rate": 4.6438640959750285e-07, "loss": 0.2913, "step": 1409 }, { "epoch": 12.051282051282051, "grad_norm": 2.8135483449892176, "learning_rate": 4.617917240011394e-07, "loss": 0.2096, "step": 1410 }, { "epoch": 12.05982905982906, "grad_norm": 1.0958182096094111, "learning_rate": 4.592035696192948e-07, "loss": 0.1916, "step": 1411 }, { "epoch": 12.068376068376068, "grad_norm": 1.2127130625533227, "learning_rate": 4.566219547454251e-07, "loss": 0.2594, "step": 1412 }, { "epoch": 12.076923076923077, "grad_norm": 1.1936809604241543, "learning_rate": 4.5404688765203236e-07, "loss": 0.2925, "step": 1413 }, { "epoch": 12.085470085470085, "grad_norm": 1.1130311861207984, "learning_rate": 4.514783765906369e-07, "loss": 0.2031, "step": 1414 }, { "epoch": 12.094017094017094, "grad_norm": 1.3007322916470179, "learning_rate": 4.489164297917492e-07, "loss": 0.4502, "step": 1415 }, { "epoch": 12.102564102564102, "grad_norm": 1.0994068000291886, "learning_rate": 4.463610554648459e-07, "loss": 0.4473, "step": 1416 }, { "epoch": 12.11111111111111, "grad_norm": 1.1115069843985332, "learning_rate": 4.438122617983442e-07, "loss": 0.1048, "step": 1417 }, { "epoch": 12.11965811965812, "grad_norm": 1.0846637850713525, "learning_rate": 4.4127005695957374e-07, "loss": 0.2828, "step": 1418 }, { "epoch": 12.128205128205128, "grad_norm": 0.8866260859036948, "learning_rate": 4.3873444909474985e-07, "loss": 0.216, "step": 1419 }, { "epoch": 12.136752136752136, "grad_norm": 1.092900767346248, "learning_rate": 4.3620544632895e-07, "loss": 0.274, "step": 1420 }, { "epoch": 12.145299145299145, "grad_norm": 1.0884795291953004, "learning_rate": 4.336830567660855e-07, "loss": 0.4049, "step": 1421 }, { "epoch": 12.153846153846153, "grad_norm": 1.2438059455775952, "learning_rate": 4.311672884888757e-07, "loss": 0.4181, "step": 1422 }, { "epoch": 12.162393162393162, "grad_norm": 1.2209213090493276, "learning_rate": 4.286581495588249e-07, "loss": 0.3342, "step": 1423 }, { "epoch": 12.17094017094017, "grad_norm": 1.0078786941883342, "learning_rate": 4.2615564801619325e-07, "loss": 0.4002, "step": 1424 }, { "epoch": 12.179487179487179, "grad_norm": 1.3416635010873221, "learning_rate": 4.2365979187997094e-07, "loss": 0.2419, "step": 1425 }, { "epoch": 12.188034188034187, "grad_norm": 1.1175759363671287, "learning_rate": 4.21170589147856e-07, "loss": 0.2284, "step": 1426 }, { "epoch": 12.196581196581196, "grad_norm": 1.2925695930187564, "learning_rate": 4.1868804779622437e-07, "loss": 0.3257, "step": 1427 }, { "epoch": 12.205128205128204, "grad_norm": 0.9971032549939878, "learning_rate": 4.1621217578010686e-07, "loss": 0.2485, "step": 1428 }, { "epoch": 12.213675213675213, "grad_norm": 1.46223278685261, "learning_rate": 4.137429810331639e-07, "loss": 0.4119, "step": 1429 }, { "epoch": 12.222222222222221, "grad_norm": 1.0768684889712687, "learning_rate": 4.1128047146765936e-07, "loss": 0.2551, "step": 1430 }, { "epoch": 12.23076923076923, "grad_norm": 0.9499969600745545, "learning_rate": 4.088246549744332e-07, "loss": 0.3378, "step": 1431 }, { "epoch": 12.239316239316238, "grad_norm": 1.06970202901976, "learning_rate": 4.063755394228811e-07, "loss": 0.3059, "step": 1432 }, { "epoch": 12.247863247863247, "grad_norm": 1.1829683368454302, "learning_rate": 4.039331326609239e-07, "loss": 0.217, "step": 1433 }, { "epoch": 12.256410256410255, "grad_norm": 1.4564822793866774, "learning_rate": 4.0149744251498537e-07, "loss": 0.2916, "step": 1434 }, { "epoch": 12.264957264957266, "grad_norm": 1.3856240797241763, "learning_rate": 3.990684767899677e-07, "loss": 0.297, "step": 1435 }, { "epoch": 12.273504273504274, "grad_norm": 1.256964556701651, "learning_rate": 3.9664624326922447e-07, "loss": 0.4278, "step": 1436 }, { "epoch": 12.282051282051283, "grad_norm": 1.265967197096582, "learning_rate": 3.9423074971453785e-07, "loss": 0.3981, "step": 1437 }, { "epoch": 12.290598290598291, "grad_norm": 1.3605620355633015, "learning_rate": 3.918220038660908e-07, "loss": 0.2225, "step": 1438 }, { "epoch": 12.2991452991453, "grad_norm": 0.8642733786395712, "learning_rate": 3.8942001344244416e-07, "loss": 0.2764, "step": 1439 }, { "epoch": 12.307692307692308, "grad_norm": 1.0067964022511735, "learning_rate": 3.8702478614051353e-07, "loss": 0.2164, "step": 1440 }, { "epoch": 12.316239316239317, "grad_norm": 1.0214820970789222, "learning_rate": 3.846363296355404e-07, "loss": 0.3077, "step": 1441 }, { "epoch": 12.324786324786325, "grad_norm": 0.8436702793433775, "learning_rate": 3.822546515810724e-07, "loss": 0.308, "step": 1442 }, { "epoch": 12.333333333333334, "grad_norm": 0.9345090665422185, "learning_rate": 3.798797596089351e-07, "loss": 0.1793, "step": 1443 }, { "epoch": 12.341880341880342, "grad_norm": 0.8187825789284637, "learning_rate": 3.7751166132920877e-07, "loss": 0.1257, "step": 1444 }, { "epoch": 12.350427350427351, "grad_norm": 0.970866368219796, "learning_rate": 3.751503643302035e-07, "loss": 0.1998, "step": 1445 }, { "epoch": 12.35897435897436, "grad_norm": 1.5138688300904741, "learning_rate": 3.727958761784375e-07, "loss": 0.421, "step": 1446 }, { "epoch": 12.367521367521368, "grad_norm": 1.0829254665533683, "learning_rate": 3.7044820441860806e-07, "loss": 0.4511, "step": 1447 }, { "epoch": 12.376068376068377, "grad_norm": 1.1777412706647559, "learning_rate": 3.681073565735718e-07, "loss": 0.3406, "step": 1448 }, { "epoch": 12.384615384615385, "grad_norm": 1.3337291401337419, "learning_rate": 3.6577334014432003e-07, "loss": 0.4862, "step": 1449 }, { "epoch": 12.393162393162394, "grad_norm": 0.9920625204158359, "learning_rate": 3.634461626099495e-07, "loss": 0.3188, "step": 1450 }, { "epoch": 12.401709401709402, "grad_norm": 1.1732324209380567, "learning_rate": 3.611258314276461e-07, "loss": 0.2023, "step": 1451 }, { "epoch": 12.41025641025641, "grad_norm": 1.4368434603949196, "learning_rate": 3.5881235403265713e-07, "loss": 0.4928, "step": 1452 }, { "epoch": 12.418803418803419, "grad_norm": 1.209880026192477, "learning_rate": 3.56505737838265e-07, "loss": 0.5794, "step": 1453 }, { "epoch": 12.427350427350428, "grad_norm": 1.0765303844783172, "learning_rate": 3.5420599023576946e-07, "loss": 0.3588, "step": 1454 }, { "epoch": 12.435897435897436, "grad_norm": 1.1952856792258182, "learning_rate": 3.51913118594458e-07, "loss": 0.3529, "step": 1455 }, { "epoch": 12.444444444444445, "grad_norm": 1.4189895503204382, "learning_rate": 3.4962713026158697e-07, "loss": 0.3579, "step": 1456 }, { "epoch": 12.452991452991453, "grad_norm": 1.0904847944667027, "learning_rate": 3.473480325623535e-07, "loss": 0.3049, "step": 1457 }, { "epoch": 12.461538461538462, "grad_norm": 1.0804400329952155, "learning_rate": 3.450758327998768e-07, "loss": 0.3236, "step": 1458 }, { "epoch": 12.47008547008547, "grad_norm": 0.9096956137546562, "learning_rate": 3.428105382551716e-07, "loss": 0.3332, "step": 1459 }, { "epoch": 12.478632478632479, "grad_norm": 0.9262886907290493, "learning_rate": 3.405521561871247e-07, "loss": 0.164, "step": 1460 }, { "epoch": 12.487179487179487, "grad_norm": 1.0500627883818996, "learning_rate": 3.3830069383247343e-07, "loss": 0.5438, "step": 1461 }, { "epoch": 12.495726495726496, "grad_norm": 1.2508699587963135, "learning_rate": 3.3605615840578224e-07, "loss": 0.2713, "step": 1462 }, { "epoch": 12.504273504273504, "grad_norm": 1.2153830905771117, "learning_rate": 3.3381855709941733e-07, "loss": 0.1354, "step": 1463 }, { "epoch": 12.512820512820513, "grad_norm": 1.1888683890499405, "learning_rate": 3.315878970835268e-07, "loss": 0.2409, "step": 1464 }, { "epoch": 12.521367521367521, "grad_norm": 1.1059794294375975, "learning_rate": 3.293641855060162e-07, "loss": 0.3336, "step": 1465 }, { "epoch": 12.52991452991453, "grad_norm": 1.1840113168496267, "learning_rate": 3.2714742949252447e-07, "loss": 0.2742, "step": 1466 }, { "epoch": 12.538461538461538, "grad_norm": 2.1592550264073345, "learning_rate": 3.249376361464021e-07, "loss": 0.2109, "step": 1467 }, { "epoch": 12.547008547008547, "grad_norm": 1.0812187409812728, "learning_rate": 3.227348125486904e-07, "loss": 0.2329, "step": 1468 }, { "epoch": 12.555555555555555, "grad_norm": 1.1007960406636101, "learning_rate": 3.2053896575809426e-07, "loss": 0.3259, "step": 1469 }, { "epoch": 12.564102564102564, "grad_norm": 1.1818858763469258, "learning_rate": 3.1835010281096426e-07, "loss": 0.2348, "step": 1470 }, { "epoch": 12.572649572649572, "grad_norm": 1.036774434563237, "learning_rate": 3.1616823072127157e-07, "loss": 0.358, "step": 1471 }, { "epoch": 12.581196581196581, "grad_norm": 0.9144896563328214, "learning_rate": 3.1399335648058555e-07, "loss": 0.2064, "step": 1472 }, { "epoch": 12.58974358974359, "grad_norm": 1.1880151496219533, "learning_rate": 3.118254870580506e-07, "loss": 0.0903, "step": 1473 }, { "epoch": 12.598290598290598, "grad_norm": 1.086336466044215, "learning_rate": 3.096646294003675e-07, "loss": 0.339, "step": 1474 }, { "epoch": 12.606837606837606, "grad_norm": 1.446415527228014, "learning_rate": 3.075107904317667e-07, "loss": 0.5688, "step": 1475 }, { "epoch": 12.615384615384615, "grad_norm": 0.8813039270395069, "learning_rate": 3.0536397705398845e-07, "loss": 0.2948, "step": 1476 }, { "epoch": 12.623931623931623, "grad_norm": 0.9608868381854022, "learning_rate": 3.032241961462612e-07, "loss": 0.3666, "step": 1477 }, { "epoch": 12.632478632478632, "grad_norm": 1.0169318443426791, "learning_rate": 3.010914545652771e-07, "loss": 0.205, "step": 1478 }, { "epoch": 12.64102564102564, "grad_norm": 1.2527583493254357, "learning_rate": 2.9896575914517166e-07, "loss": 0.3132, "step": 1479 }, { "epoch": 12.649572649572649, "grad_norm": 1.4280662690242643, "learning_rate": 2.9684711669750313e-07, "loss": 0.1883, "step": 1480 }, { "epoch": 12.658119658119658, "grad_norm": 1.1301566337681153, "learning_rate": 2.9473553401122875e-07, "loss": 0.1991, "step": 1481 }, { "epoch": 12.666666666666666, "grad_norm": 1.0433471063867608, "learning_rate": 2.9263101785268253e-07, "loss": 0.2524, "step": 1482 }, { "epoch": 12.675213675213675, "grad_norm": 1.7274104052810784, "learning_rate": 2.9053357496555635e-07, "loss": 0.1876, "step": 1483 }, { "epoch": 12.683760683760683, "grad_norm": 1.036859974163086, "learning_rate": 2.8844321207087465e-07, "loss": 0.1416, "step": 1484 }, { "epoch": 12.692307692307692, "grad_norm": 0.7738557295444807, "learning_rate": 2.8635993586697555e-07, "loss": 0.1175, "step": 1485 }, { "epoch": 12.7008547008547, "grad_norm": 1.187284383819613, "learning_rate": 2.84283753029489e-07, "loss": 0.2627, "step": 1486 }, { "epoch": 12.709401709401709, "grad_norm": 0.8173656024155544, "learning_rate": 2.822146702113157e-07, "loss": 0.1562, "step": 1487 }, { "epoch": 12.717948717948717, "grad_norm": 1.1095747046438942, "learning_rate": 2.8015269404260333e-07, "loss": 0.1976, "step": 1488 }, { "epoch": 12.726495726495726, "grad_norm": 0.9916720254317867, "learning_rate": 2.780978311307278e-07, "loss": 0.1927, "step": 1489 }, { "epoch": 12.735042735042736, "grad_norm": 0.8982994853794152, "learning_rate": 2.7605008806027205e-07, "loss": 0.1149, "step": 1490 }, { "epoch": 12.743589743589745, "grad_norm": 0.8876994162052723, "learning_rate": 2.7400947139300443e-07, "loss": 0.3522, "step": 1491 }, { "epoch": 12.752136752136753, "grad_norm": 1.1108135115334203, "learning_rate": 2.7197598766785544e-07, "loss": 0.2567, "step": 1492 }, { "epoch": 12.760683760683762, "grad_norm": 0.9849565773663954, "learning_rate": 2.6994964340090163e-07, "loss": 0.2471, "step": 1493 }, { "epoch": 12.76923076923077, "grad_norm": 1.3189832219649773, "learning_rate": 2.679304450853401e-07, "loss": 0.2143, "step": 1494 }, { "epoch": 12.777777777777779, "grad_norm": 1.2483986620984422, "learning_rate": 2.6591839919146963e-07, "loss": 0.6708, "step": 1495 }, { "epoch": 12.786324786324787, "grad_norm": 1.1032130060771996, "learning_rate": 2.6391351216667053e-07, "loss": 0.373, "step": 1496 }, { "epoch": 12.794871794871796, "grad_norm": 1.2634362960620278, "learning_rate": 2.6191579043538333e-07, "loss": 0.1604, "step": 1497 }, { "epoch": 12.803418803418804, "grad_norm": 1.0765362403824454, "learning_rate": 2.599252403990873e-07, "loss": 0.3473, "step": 1498 }, { "epoch": 12.811965811965813, "grad_norm": 1.1233098805300556, "learning_rate": 2.5794186843628247e-07, "loss": 0.2566, "step": 1499 }, { "epoch": 12.820512820512821, "grad_norm": 1.0012569569825553, "learning_rate": 2.5596568090246546e-07, "loss": 0.2917, "step": 1500 }, { "epoch": 12.82905982905983, "grad_norm": 2.5769967962871263, "learning_rate": 2.539966841301117e-07, "loss": 0.2105, "step": 1501 }, { "epoch": 12.837606837606838, "grad_norm": 1.0551006567181496, "learning_rate": 2.5203488442865574e-07, "loss": 0.5103, "step": 1502 }, { "epoch": 12.846153846153847, "grad_norm": 1.0739659574771914, "learning_rate": 2.5008028808446995e-07, "loss": 0.3336, "step": 1503 }, { "epoch": 12.854700854700855, "grad_norm": 1.0004107986619046, "learning_rate": 2.481329013608419e-07, "loss": 0.189, "step": 1504 }, { "epoch": 12.863247863247864, "grad_norm": 1.198361498583922, "learning_rate": 2.4619273049796e-07, "loss": 0.2824, "step": 1505 }, { "epoch": 12.871794871794872, "grad_norm": 1.0439673382608252, "learning_rate": 2.4425978171288807e-07, "loss": 0.3656, "step": 1506 }, { "epoch": 12.88034188034188, "grad_norm": 2.5685003689848434, "learning_rate": 2.4233406119954756e-07, "loss": 0.1698, "step": 1507 }, { "epoch": 12.88888888888889, "grad_norm": 1.0793633165726377, "learning_rate": 2.404155751286988e-07, "loss": 0.2515, "step": 1508 }, { "epoch": 12.897435897435898, "grad_norm": 2.2961950625371492, "learning_rate": 2.385043296479195e-07, "loss": 0.1088, "step": 1509 }, { "epoch": 12.905982905982906, "grad_norm": 1.111817950970189, "learning_rate": 2.3660033088158646e-07, "loss": 0.3174, "step": 1510 }, { "epoch": 12.914529914529915, "grad_norm": 1.0582422873750743, "learning_rate": 2.3470358493085433e-07, "loss": 0.545, "step": 1511 }, { "epoch": 12.923076923076923, "grad_norm": 1.0443314605511564, "learning_rate": 2.3281409787363652e-07, "loss": 0.2725, "step": 1512 }, { "epoch": 12.931623931623932, "grad_norm": 2.47403877787119, "learning_rate": 2.309318757645873e-07, "loss": 0.1496, "step": 1513 }, { "epoch": 12.94017094017094, "grad_norm": 1.3943374589312603, "learning_rate": 2.2905692463508045e-07, "loss": 0.2497, "step": 1514 }, { "epoch": 12.948717948717949, "grad_norm": 0.9048792274362765, "learning_rate": 2.271892504931905e-07, "loss": 0.1958, "step": 1515 }, { "epoch": 12.957264957264957, "grad_norm": 1.3903380713051325, "learning_rate": 2.253288593236755e-07, "loss": 0.5352, "step": 1516 }, { "epoch": 12.965811965811966, "grad_norm": 1.312412248496653, "learning_rate": 2.234757570879534e-07, "loss": 0.4187, "step": 1517 }, { "epoch": 12.974358974358974, "grad_norm": 1.0728502906390496, "learning_rate": 2.2162994972408647e-07, "loss": 0.4241, "step": 1518 }, { "epoch": 12.982905982905983, "grad_norm": 1.4832237419691372, "learning_rate": 2.1979144314676239e-07, "loss": 0.3298, "step": 1519 }, { "epoch": 12.991452991452991, "grad_norm": 1.8596141275548963, "learning_rate": 2.17960243247273e-07, "loss": 0.2932, "step": 1520 }, { "epoch": 13.0, "grad_norm": 1.756329510002311, "learning_rate": 2.1613635589349756e-07, "loss": 0.3875, "step": 1521 }, { "epoch": 13.008547008547009, "grad_norm": 1.1505738852548575, "learning_rate": 2.1431978692988298e-07, "loss": 0.2159, "step": 1522 }, { "epoch": 13.017094017094017, "grad_norm": 1.0857319212781678, "learning_rate": 2.1251054217742484e-07, "loss": 0.2768, "step": 1523 }, { "epoch": 13.025641025641026, "grad_norm": 1.0267844636312484, "learning_rate": 2.107086274336484e-07, "loss": 0.3875, "step": 1524 }, { "epoch": 13.034188034188034, "grad_norm": 0.9343948903201362, "learning_rate": 2.0891404847259267e-07, "loss": 0.3016, "step": 1525 }, { "epoch": 13.042735042735043, "grad_norm": 1.155357237625711, "learning_rate": 2.0712681104478742e-07, "loss": 0.2082, "step": 1526 }, { "epoch": 13.051282051282051, "grad_norm": 1.1515917888558194, "learning_rate": 2.0534692087724017e-07, "loss": 0.4446, "step": 1527 }, { "epoch": 13.05982905982906, "grad_norm": 0.8190061036835171, "learning_rate": 2.0357438367341248e-07, "loss": 0.1272, "step": 1528 }, { "epoch": 13.068376068376068, "grad_norm": 0.7712922541814253, "learning_rate": 2.0180920511320562e-07, "loss": 0.0751, "step": 1529 }, { "epoch": 13.076923076923077, "grad_norm": 1.0294870961933071, "learning_rate": 2.0005139085293945e-07, "loss": 0.2942, "step": 1530 }, { "epoch": 13.085470085470085, "grad_norm": 0.8439613811790312, "learning_rate": 1.983009465253377e-07, "loss": 0.2488, "step": 1531 }, { "epoch": 13.094017094017094, "grad_norm": 1.1553682818767042, "learning_rate": 1.9655787773950691e-07, "loss": 0.4796, "step": 1532 }, { "epoch": 13.102564102564102, "grad_norm": 1.2396451404876954, "learning_rate": 1.9482219008091885e-07, "loss": 0.3161, "step": 1533 }, { "epoch": 13.11111111111111, "grad_norm": 0.8384235193749258, "learning_rate": 1.9309388911139427e-07, "loss": 0.2178, "step": 1534 }, { "epoch": 13.11965811965812, "grad_norm": 0.9678932445180488, "learning_rate": 1.9137298036908392e-07, "loss": 0.1476, "step": 1535 }, { "epoch": 13.128205128205128, "grad_norm": 1.14547754453679, "learning_rate": 1.896594693684503e-07, "loss": 0.3058, "step": 1536 }, { "epoch": 13.136752136752136, "grad_norm": 1.037919981668251, "learning_rate": 1.8795336160025157e-07, "loss": 0.2297, "step": 1537 }, { "epoch": 13.145299145299145, "grad_norm": 1.45584546731332, "learning_rate": 1.8625466253152314e-07, "loss": 0.6431, "step": 1538 }, { "epoch": 13.153846153846153, "grad_norm": 0.9426394319446211, "learning_rate": 1.8456337760555915e-07, "loss": 0.1778, "step": 1539 }, { "epoch": 13.162393162393162, "grad_norm": 1.3071195563971694, "learning_rate": 1.8287951224189555e-07, "loss": 0.3046, "step": 1540 }, { "epoch": 13.17094017094017, "grad_norm": 1.0013067940381606, "learning_rate": 1.8120307183629533e-07, "loss": 0.1752, "step": 1541 }, { "epoch": 13.179487179487179, "grad_norm": 1.009937942806977, "learning_rate": 1.7953406176072636e-07, "loss": 0.1444, "step": 1542 }, { "epoch": 13.188034188034187, "grad_norm": 0.907238022289693, "learning_rate": 1.778724873633486e-07, "loss": 0.2081, "step": 1543 }, { "epoch": 13.196581196581196, "grad_norm": 1.1561662315881491, "learning_rate": 1.7621835396849528e-07, "loss": 0.3784, "step": 1544 }, { "epoch": 13.205128205128204, "grad_norm": 1.0786060827716064, "learning_rate": 1.745716668766545e-07, "loss": 0.2069, "step": 1545 }, { "epoch": 13.213675213675213, "grad_norm": 1.143617253857377, "learning_rate": 1.7293243136445398e-07, "loss": 0.2093, "step": 1546 }, { "epoch": 13.222222222222221, "grad_norm": 0.9032057647054741, "learning_rate": 1.713006526846439e-07, "loss": 0.2647, "step": 1547 }, { "epoch": 13.23076923076923, "grad_norm": 1.1492208541704318, "learning_rate": 1.6967633606608082e-07, "loss": 0.2843, "step": 1548 }, { "epoch": 13.239316239316238, "grad_norm": 1.1662669216870896, "learning_rate": 1.6805948671370726e-07, "loss": 0.519, "step": 1549 }, { "epoch": 13.247863247863247, "grad_norm": 1.1127259115281096, "learning_rate": 1.6645010980854082e-07, "loss": 0.3714, "step": 1550 }, { "epoch": 13.256410256410255, "grad_norm": 0.78442378098213, "learning_rate": 1.6484821050765209e-07, "loss": 0.0919, "step": 1551 }, { "epoch": 13.264957264957266, "grad_norm": 1.1438475548638625, "learning_rate": 1.6325379394415168e-07, "loss": 0.4341, "step": 1552 }, { "epoch": 13.273504273504274, "grad_norm": 0.9868013923057376, "learning_rate": 1.6166686522717217e-07, "loss": 0.3411, "step": 1553 }, { "epoch": 13.282051282051283, "grad_norm": 0.906498367822162, "learning_rate": 1.600874294418528e-07, "loss": 0.1169, "step": 1554 }, { "epoch": 13.290598290598291, "grad_norm": 1.330656388274806, "learning_rate": 1.5851549164932118e-07, "loss": 0.3343, "step": 1555 }, { "epoch": 13.2991452991453, "grad_norm": 1.0982661982531574, "learning_rate": 1.569510568866803e-07, "loss": 0.1931, "step": 1556 }, { "epoch": 13.307692307692308, "grad_norm": 1.007238900807903, "learning_rate": 1.5539413016698923e-07, "loss": 0.3144, "step": 1557 }, { "epoch": 13.316239316239317, "grad_norm": 0.974750212176006, "learning_rate": 1.5384471647924781e-07, "loss": 0.435, "step": 1558 }, { "epoch": 13.324786324786325, "grad_norm": 1.181406302290183, "learning_rate": 1.5230282078838255e-07, "loss": 0.2805, "step": 1559 }, { "epoch": 13.333333333333334, "grad_norm": 1.0014146267263002, "learning_rate": 1.507684480352292e-07, "loss": 0.2635, "step": 1560 }, { "epoch": 13.341880341880342, "grad_norm": 1.2320491733226004, "learning_rate": 1.4924160313651598e-07, "loss": 0.3606, "step": 1561 }, { "epoch": 13.350427350427351, "grad_norm": 0.9911464162208959, "learning_rate": 1.4772229098485053e-07, "loss": 0.2405, "step": 1562 }, { "epoch": 13.35897435897436, "grad_norm": 1.2551361681677689, "learning_rate": 1.46210516448701e-07, "loss": 0.4468, "step": 1563 }, { "epoch": 13.367521367521368, "grad_norm": 1.0520156172205075, "learning_rate": 1.447062843723826e-07, "loss": 0.2223, "step": 1564 }, { "epoch": 13.376068376068377, "grad_norm": 0.9628315407802782, "learning_rate": 1.432095995760424e-07, "loss": 0.2264, "step": 1565 }, { "epoch": 13.384615384615385, "grad_norm": 1.2836910046347718, "learning_rate": 1.417204668556421e-07, "loss": 0.5595, "step": 1566 }, { "epoch": 13.393162393162394, "grad_norm": 1.1889920690605311, "learning_rate": 1.402388909829447e-07, "loss": 0.2119, "step": 1567 }, { "epoch": 13.401709401709402, "grad_norm": 1.0303893972646192, "learning_rate": 1.387648767054961e-07, "loss": 0.2094, "step": 1568 }, { "epoch": 13.41025641025641, "grad_norm": 0.8281095566416786, "learning_rate": 1.3729842874661365e-07, "loss": 0.3305, "step": 1569 }, { "epoch": 13.418803418803419, "grad_norm": 1.094637661606762, "learning_rate": 1.35839551805369e-07, "loss": 0.3879, "step": 1570 }, { "epoch": 13.427350427350428, "grad_norm": 1.1817341481124783, "learning_rate": 1.3438825055657246e-07, "loss": 0.2283, "step": 1571 }, { "epoch": 13.435897435897436, "grad_norm": 1.167606047139076, "learning_rate": 1.3294452965076031e-07, "loss": 0.3251, "step": 1572 }, { "epoch": 13.444444444444445, "grad_norm": 1.0632336244379381, "learning_rate": 1.31508393714177e-07, "loss": 0.3327, "step": 1573 }, { "epoch": 13.452991452991453, "grad_norm": 1.1356919410815811, "learning_rate": 1.3007984734876217e-07, "loss": 0.1932, "step": 1574 }, { "epoch": 13.461538461538462, "grad_norm": 1.1019667814822336, "learning_rate": 1.286588951321363e-07, "loss": 0.4479, "step": 1575 }, { "epoch": 13.47008547008547, "grad_norm": 1.2456887117109419, "learning_rate": 1.272455416175844e-07, "loss": 0.2, "step": 1576 }, { "epoch": 13.478632478632479, "grad_norm": 0.8768840020349393, "learning_rate": 1.258397913340423e-07, "loss": 0.1898, "step": 1577 }, { "epoch": 13.487179487179487, "grad_norm": 0.8610819031956608, "learning_rate": 1.2444164878608307e-07, "loss": 0.1477, "step": 1578 }, { "epoch": 13.495726495726496, "grad_norm": 1.0823294789754605, "learning_rate": 1.2305111845390043e-07, "loss": 0.1623, "step": 1579 }, { "epoch": 13.504273504273504, "grad_norm": 0.9631874294889519, "learning_rate": 1.2166820479329572e-07, "loss": 0.3338, "step": 1580 }, { "epoch": 13.512820512820513, "grad_norm": 1.0530613205307584, "learning_rate": 1.2029291223566413e-07, "loss": 0.1973, "step": 1581 }, { "epoch": 13.521367521367521, "grad_norm": 0.8717117935351345, "learning_rate": 1.1892524518797993e-07, "loss": 0.2069, "step": 1582 }, { "epoch": 13.52991452991453, "grad_norm": 0.9608587841521585, "learning_rate": 1.1756520803278188e-07, "loss": 0.2495, "step": 1583 }, { "epoch": 13.538461538461538, "grad_norm": 1.5519958028368797, "learning_rate": 1.1621280512815941e-07, "loss": 0.171, "step": 1584 }, { "epoch": 13.547008547008547, "grad_norm": 0.9131328583735898, "learning_rate": 1.1486804080773878e-07, "loss": 0.2453, "step": 1585 }, { "epoch": 13.555555555555555, "grad_norm": 0.99728593250585, "learning_rate": 1.1353091938067024e-07, "loss": 0.2087, "step": 1586 }, { "epoch": 13.564102564102564, "grad_norm": 1.0702964515597055, "learning_rate": 1.1220144513161197e-07, "loss": 0.3104, "step": 1587 }, { "epoch": 13.572649572649572, "grad_norm": 1.077157969082805, "learning_rate": 1.1087962232071814e-07, "loss": 0.3807, "step": 1588 }, { "epoch": 13.581196581196581, "grad_norm": 1.1826954134832977, "learning_rate": 1.0956545518362532e-07, "loss": 0.2355, "step": 1589 }, { "epoch": 13.58974358974359, "grad_norm": 1.156905738771152, "learning_rate": 1.0825894793143721e-07, "loss": 0.1391, "step": 1590 }, { "epoch": 13.598290598290598, "grad_norm": 1.3506966457826057, "learning_rate": 1.0696010475071267e-07, "loss": 0.2835, "step": 1591 }, { "epoch": 13.606837606837606, "grad_norm": 1.3963385583754653, "learning_rate": 1.0566892980345245e-07, "loss": 0.4713, "step": 1592 }, { "epoch": 13.615384615384615, "grad_norm": 1.1187293180307003, "learning_rate": 1.0438542722708444e-07, "loss": 0.4176, "step": 1593 }, { "epoch": 13.623931623931623, "grad_norm": 1.2856600898305055, "learning_rate": 1.0310960113445179e-07, "loss": 0.4702, "step": 1594 }, { "epoch": 13.632478632478632, "grad_norm": 1.1958490213395467, "learning_rate": 1.0184145561379949e-07, "loss": 0.1315, "step": 1595 }, { "epoch": 13.64102564102564, "grad_norm": 0.7705316255345382, "learning_rate": 1.0058099472876004e-07, "loss": 0.1864, "step": 1596 }, { "epoch": 13.649572649572649, "grad_norm": 0.8840288370368232, "learning_rate": 9.932822251834173e-08, "loss": 0.338, "step": 1597 }, { "epoch": 13.658119658119658, "grad_norm": 1.0348235453397847, "learning_rate": 9.808314299691591e-08, "loss": 0.3332, "step": 1598 }, { "epoch": 13.666666666666666, "grad_norm": 1.1828653933998128, "learning_rate": 9.684576015420277e-08, "loss": 0.3737, "step": 1599 }, { "epoch": 13.675213675213675, "grad_norm": 1.0091289878163927, "learning_rate": 9.561607795526007e-08, "loss": 0.2419, "step": 1600 }, { "epoch": 13.683760683760683, "grad_norm": 1.0828857407343973, "learning_rate": 9.439410034046942e-08, "loss": 0.2528, "step": 1601 }, { "epoch": 13.692307692307692, "grad_norm": 1.2959615159059084, "learning_rate": 9.317983122552332e-08, "loss": 0.1915, "step": 1602 }, { "epoch": 13.7008547008547, "grad_norm": 1.379162456698023, "learning_rate": 9.197327450141402e-08, "loss": 0.3602, "step": 1603 }, { "epoch": 13.709401709401709, "grad_norm": 1.1832886065543922, "learning_rate": 9.077443403441994e-08, "loss": 0.3007, "step": 1604 }, { "epoch": 13.717948717948717, "grad_norm": 1.159394233790722, "learning_rate": 8.958331366609424e-08, "loss": 0.2803, "step": 1605 }, { "epoch": 13.726495726495726, "grad_norm": 0.9755941982498277, "learning_rate": 8.839991721325047e-08, "loss": 0.2278, "step": 1606 }, { "epoch": 13.735042735042736, "grad_norm": 1.1557656098603584, "learning_rate": 8.72242484679528e-08, "loss": 0.3131, "step": 1607 }, { "epoch": 13.743589743589745, "grad_norm": 1.1722877958343039, "learning_rate": 8.605631119750297e-08, "loss": 0.3569, "step": 1608 }, { "epoch": 13.752136752136753, "grad_norm": 0.9489527958140329, "learning_rate": 8.489610914442697e-08, "loss": 0.3719, "step": 1609 }, { "epoch": 13.760683760683762, "grad_norm": 0.9781463695410046, "learning_rate": 8.374364602646512e-08, "loss": 0.2721, "step": 1610 }, { "epoch": 13.76923076923077, "grad_norm": 1.2276231195845517, "learning_rate": 8.259892553655946e-08, "loss": 0.2396, "step": 1611 }, { "epoch": 13.777777777777779, "grad_norm": 0.8125741331372663, "learning_rate": 8.146195134284052e-08, "loss": 0.282, "step": 1612 }, { "epoch": 13.786324786324787, "grad_norm": 1.0716815459854674, "learning_rate": 8.033272708861673e-08, "loss": 0.3969, "step": 1613 }, { "epoch": 13.794871794871796, "grad_norm": 0.9978778449841479, "learning_rate": 7.921125639236416e-08, "loss": 0.2373, "step": 1614 }, { "epoch": 13.803418803418804, "grad_norm": 0.921081013180687, "learning_rate": 7.809754284771181e-08, "loss": 0.225, "step": 1615 }, { "epoch": 13.811965811965813, "grad_norm": 1.3841256382567415, "learning_rate": 7.699159002343248e-08, "loss": 0.2923, "step": 1616 }, { "epoch": 13.820512820512821, "grad_norm": 1.0531740575034831, "learning_rate": 7.589340146343077e-08, "loss": 0.1851, "step": 1617 }, { "epoch": 13.82905982905983, "grad_norm": 1.052402359782648, "learning_rate": 7.48029806867312e-08, "loss": 0.2215, "step": 1618 }, { "epoch": 13.837606837606838, "grad_norm": 0.9474502677015596, "learning_rate": 7.372033118746708e-08, "loss": 0.3537, "step": 1619 }, { "epoch": 13.846153846153847, "grad_norm": 1.0433066188917253, "learning_rate": 7.264545643486997e-08, "loss": 0.192, "step": 1620 }, { "epoch": 13.854700854700855, "grad_norm": 1.0503434380249135, "learning_rate": 7.157835987325807e-08, "loss": 0.4817, "step": 1621 }, { "epoch": 13.863247863247864, "grad_norm": 1.0208969742480898, "learning_rate": 7.051904492202472e-08, "loss": 0.1586, "step": 1622 }, { "epoch": 13.871794871794872, "grad_norm": 1.0691421470631344, "learning_rate": 6.946751497562909e-08, "loss": 0.3521, "step": 1623 }, { "epoch": 13.88034188034188, "grad_norm": 1.2245483605550895, "learning_rate": 6.842377340358252e-08, "loss": 0.5142, "step": 1624 }, { "epoch": 13.88888888888889, "grad_norm": 1.0065600482948194, "learning_rate": 6.738782355044048e-08, "loss": 0.3602, "step": 1625 }, { "epoch": 13.897435897435898, "grad_norm": 1.0654700012366067, "learning_rate": 6.635966873579063e-08, "loss": 0.3004, "step": 1626 }, { "epoch": 13.905982905982906, "grad_norm": 1.3042405801995822, "learning_rate": 6.5339312254242e-08, "loss": 0.3589, "step": 1627 }, { "epoch": 13.914529914529915, "grad_norm": 1.1138127249535994, "learning_rate": 6.432675737541499e-08, "loss": 0.3743, "step": 1628 }, { "epoch": 13.923076923076923, "grad_norm": 0.809005507517847, "learning_rate": 6.332200734393057e-08, "loss": 0.1839, "step": 1629 }, { "epoch": 13.931623931623932, "grad_norm": 1.0368280042421736, "learning_rate": 6.232506537939942e-08, "loss": 0.1791, "step": 1630 }, { "epoch": 13.94017094017094, "grad_norm": 1.0270604550679647, "learning_rate": 6.13359346764128e-08, "loss": 0.4125, "step": 1631 }, { "epoch": 13.948717948717949, "grad_norm": 1.1754201805715523, "learning_rate": 6.035461840453116e-08, "loss": 0.4687, "step": 1632 }, { "epoch": 13.957264957264957, "grad_norm": 0.8604966197339347, "learning_rate": 5.938111970827526e-08, "loss": 0.2878, "step": 1633 }, { "epoch": 13.965811965811966, "grad_norm": 1.1086668737564982, "learning_rate": 5.841544170711422e-08, "loss": 0.1041, "step": 1634 }, { "epoch": 13.974358974358974, "grad_norm": 0.9009406977063691, "learning_rate": 5.745758749545749e-08, "loss": 0.1912, "step": 1635 }, { "epoch": 13.982905982905983, "grad_norm": 1.1802740851626485, "learning_rate": 5.650756014264347e-08, "loss": 0.4936, "step": 1636 }, { "epoch": 13.991452991452991, "grad_norm": 1.2300056190706103, "learning_rate": 5.556536269293006e-08, "loss": 0.4424, "step": 1637 }, { "epoch": 14.0, "grad_norm": 1.0137995981094534, "learning_rate": 5.463099816548578e-08, "loss": 0.3284, "step": 1638 }, { "epoch": 14.008547008547009, "grad_norm": 0.9709594304196405, "learning_rate": 5.3704469554379527e-08, "loss": 0.2297, "step": 1639 }, { "epoch": 14.017094017094017, "grad_norm": 1.1299871999082094, "learning_rate": 5.278577982857025e-08, "loss": 0.3918, "step": 1640 }, { "epoch": 14.025641025641026, "grad_norm": 1.1449136156565218, "learning_rate": 5.1874931931897854e-08, "loss": 0.3966, "step": 1641 }, { "epoch": 14.034188034188034, "grad_norm": 0.9844915462506715, "learning_rate": 5.097192878307455e-08, "loss": 0.2968, "step": 1642 }, { "epoch": 14.042735042735043, "grad_norm": 1.0431617240132687, "learning_rate": 5.0076773275675174e-08, "loss": 0.4419, "step": 1643 }, { "epoch": 14.051282051282051, "grad_norm": 1.0470663059857832, "learning_rate": 4.91894682781266e-08, "loss": 0.2256, "step": 1644 }, { "epoch": 14.05982905982906, "grad_norm": 1.0482134797568037, "learning_rate": 4.831001663370083e-08, "loss": 0.4594, "step": 1645 }, { "epoch": 14.068376068376068, "grad_norm": 0.9581719946948042, "learning_rate": 4.743842116050334e-08, "loss": 0.2997, "step": 1646 }, { "epoch": 14.076923076923077, "grad_norm": 1.1271714744759658, "learning_rate": 4.657468465146642e-08, "loss": 0.3799, "step": 1647 }, { "epoch": 14.085470085470085, "grad_norm": 0.9991717412883093, "learning_rate": 4.571880987433886e-08, "loss": 0.3411, "step": 1648 }, { "epoch": 14.094017094017094, "grad_norm": 0.9440052310542084, "learning_rate": 4.487079957167767e-08, "loss": 0.2588, "step": 1649 }, { "epoch": 14.102564102564102, "grad_norm": 1.1730488791362796, "learning_rate": 4.40306564608381e-08, "loss": 0.264, "step": 1650 }, { "epoch": 14.11111111111111, "grad_norm": 1.3216740626779884, "learning_rate": 4.319838323396691e-08, "loss": 0.1883, "step": 1651 }, { "epoch": 14.11965811965812, "grad_norm": 0.7823448055638057, "learning_rate": 4.237398255799191e-08, "loss": 0.1273, "step": 1652 }, { "epoch": 14.128205128205128, "grad_norm": 0.9551378725158102, "learning_rate": 4.155745707461467e-08, "loss": 0.3024, "step": 1653 }, { "epoch": 14.136752136752136, "grad_norm": 1.0106178858369212, "learning_rate": 4.0748809400301403e-08, "loss": 0.3491, "step": 1654 }, { "epoch": 14.145299145299145, "grad_norm": 1.3301723397017644, "learning_rate": 3.994804212627462e-08, "loss": 0.4849, "step": 1655 }, { "epoch": 14.153846153846153, "grad_norm": 1.0907574351446336, "learning_rate": 3.9155157818505654e-08, "loss": 0.2302, "step": 1656 }, { "epoch": 14.162393162393162, "grad_norm": 1.0599072945861983, "learning_rate": 3.8370159017704636e-08, "loss": 0.4176, "step": 1657 }, { "epoch": 14.17094017094017, "grad_norm": 1.1921847870816986, "learning_rate": 3.759304823931359e-08, "loss": 0.4857, "step": 1658 }, { "epoch": 14.179487179487179, "grad_norm": 1.0941612458721797, "learning_rate": 3.682382797349976e-08, "loss": 0.4745, "step": 1659 }, { "epoch": 14.188034188034187, "grad_norm": 1.1544133425051988, "learning_rate": 3.6062500685143943e-08, "loss": 0.2755, "step": 1660 }, { "epoch": 14.196581196581196, "grad_norm": 1.2723721479857397, "learning_rate": 3.5309068813836056e-08, "loss": 0.6096, "step": 1661 }, { "epoch": 14.205128205128204, "grad_norm": 1.047213471583824, "learning_rate": 3.4563534773866256e-08, "loss": 0.4001, "step": 1662 }, { "epoch": 14.213675213675213, "grad_norm": 1.1591614287101293, "learning_rate": 3.382590095421606e-08, "loss": 0.2664, "step": 1663 }, { "epoch": 14.222222222222221, "grad_norm": 0.9914669953998568, "learning_rate": 3.309616971855195e-08, "loss": 0.1926, "step": 1664 }, { "epoch": 14.23076923076923, "grad_norm": 1.0844370003359844, "learning_rate": 3.237434340521789e-08, "loss": 0.2233, "step": 1665 }, { "epoch": 14.239316239316238, "grad_norm": 1.0158802297687415, "learning_rate": 3.166042432722671e-08, "loss": 0.2643, "step": 1666 }, { "epoch": 14.247863247863247, "grad_norm": 1.2925442972530232, "learning_rate": 3.095441477225347e-08, "loss": 0.2349, "step": 1667 }, { "epoch": 14.256410256410255, "grad_norm": 0.9924388816355043, "learning_rate": 3.025631700262877e-08, "loss": 0.2553, "step": 1668 }, { "epoch": 14.264957264957266, "grad_norm": 1.069284498726041, "learning_rate": 2.9566133255329864e-08, "loss": 0.3725, "step": 1669 }, { "epoch": 14.273504273504274, "grad_norm": 1.1929109155842428, "learning_rate": 2.888386574197488e-08, "loss": 0.2351, "step": 1670 }, { "epoch": 14.282051282051283, "grad_norm": 0.9617405737468387, "learning_rate": 2.8209516648814996e-08, "loss": 0.0816, "step": 1671 }, { "epoch": 14.290598290598291, "grad_norm": 0.9147539896413376, "learning_rate": 2.7543088136727792e-08, "loss": 0.3169, "step": 1672 }, { "epoch": 14.2991452991453, "grad_norm": 0.795512505038877, "learning_rate": 2.688458234121033e-08, "loss": 0.0757, "step": 1673 }, { "epoch": 14.307692307692308, "grad_norm": 0.9502257124356851, "learning_rate": 2.6234001372372196e-08, "loss": 0.3862, "step": 1674 }, { "epoch": 14.316239316239317, "grad_norm": 1.274230365165183, "learning_rate": 2.5591347314928572e-08, "loss": 0.2647, "step": 1675 }, { "epoch": 14.324786324786325, "grad_norm": 0.967532783276075, "learning_rate": 2.495662222819356e-08, "loss": 0.3305, "step": 1676 }, { "epoch": 14.333333333333334, "grad_norm": 1.3224798768697412, "learning_rate": 2.4329828146074096e-08, "loss": 0.3332, "step": 1677 }, { "epoch": 14.341880341880342, "grad_norm": 1.0727985919335998, "learning_rate": 2.3710967077063275e-08, "loss": 0.3429, "step": 1678 }, { "epoch": 14.350427350427351, "grad_norm": 1.2004949487299452, "learning_rate": 2.310004100423313e-08, "loss": 0.287, "step": 1679 }, { "epoch": 14.35897435897436, "grad_norm": 1.0812921492853662, "learning_rate": 2.2497051885228825e-08, "loss": 0.3371, "step": 1680 }, { "epoch": 14.367521367521368, "grad_norm": 0.9348664846892308, "learning_rate": 2.190200165226336e-08, "loss": 0.1468, "step": 1681 }, { "epoch": 14.376068376068377, "grad_norm": 0.979520377810362, "learning_rate": 2.131489221210953e-08, "loss": 0.2964, "step": 1682 }, { "epoch": 14.384615384615385, "grad_norm": 0.9953585036774258, "learning_rate": 2.0735725446094924e-08, "loss": 0.2317, "step": 1683 }, { "epoch": 14.393162393162394, "grad_norm": 1.0211981474940792, "learning_rate": 2.016450321009611e-08, "loss": 0.2613, "step": 1684 }, { "epoch": 14.401709401709402, "grad_norm": 0.9977623186417789, "learning_rate": 1.9601227334531958e-08, "loss": 0.2357, "step": 1685 }, { "epoch": 14.41025641025641, "grad_norm": 1.0577263179561622, "learning_rate": 1.904589962435782e-08, "loss": 0.328, "step": 1686 }, { "epoch": 14.418803418803419, "grad_norm": 0.9932052880875452, "learning_rate": 1.8498521859060814e-08, "loss": 0.2766, "step": 1687 }, { "epoch": 14.427350427350428, "grad_norm": 1.2058132888141184, "learning_rate": 1.795909579265259e-08, "loss": 0.4555, "step": 1688 }, { "epoch": 14.435897435897436, "grad_norm": 0.9202271237956986, "learning_rate": 1.7427623153664364e-08, "loss": 0.2043, "step": 1689 }, { "epoch": 14.444444444444445, "grad_norm": 1.0762171371983105, "learning_rate": 1.6904105645142443e-08, "loss": 0.39, "step": 1690 }, { "epoch": 14.452991452991453, "grad_norm": 1.1389216220816722, "learning_rate": 1.638854494464104e-08, "loss": 0.2034, "step": 1691 }, { "epoch": 14.461538461538462, "grad_norm": 0.8740065264182487, "learning_rate": 1.5880942704217528e-08, "loss": 0.2881, "step": 1692 }, { "epoch": 14.47008547008547, "grad_norm": 0.7538828934098073, "learning_rate": 1.5381300550427748e-08, "loss": 0.1935, "step": 1693 }, { "epoch": 14.478632478632479, "grad_norm": 0.9410330802083953, "learning_rate": 1.4889620084319878e-08, "loss": 0.3392, "step": 1694 }, { "epoch": 14.487179487179487, "grad_norm": 1.0121163042552828, "learning_rate": 1.4405902881430289e-08, "loss": 0.1335, "step": 1695 }, { "epoch": 14.495726495726496, "grad_norm": 1.072963591656099, "learning_rate": 1.393015049177715e-08, "loss": 0.2879, "step": 1696 }, { "epoch": 14.504273504273504, "grad_norm": 0.9404008286999908, "learning_rate": 1.3462364439857379e-08, "loss": 0.3634, "step": 1697 }, { "epoch": 14.512820512820513, "grad_norm": 1.075575665244736, "learning_rate": 1.3002546224639146e-08, "loss": 0.3437, "step": 1698 }, { "epoch": 14.521367521367521, "grad_norm": 0.9955741016463133, "learning_rate": 1.2550697319560211e-08, "loss": 0.521, "step": 1699 }, { "epoch": 14.52991452991453, "grad_norm": 0.9076329314518284, "learning_rate": 1.2106819172520434e-08, "loss": 0.3091, "step": 1700 }, { "epoch": 14.538461538461538, "grad_norm": 0.9706053653755294, "learning_rate": 1.1670913205878431e-08, "loss": 0.2192, "step": 1701 }, { "epoch": 14.547008547008547, "grad_norm": 0.9722292014471331, "learning_rate": 1.1242980816447147e-08, "loss": 0.1934, "step": 1702 }, { "epoch": 14.555555555555555, "grad_norm": 1.2950023777523025, "learning_rate": 1.0823023375489128e-08, "loss": 0.2387, "step": 1703 }, { "epoch": 14.564102564102564, "grad_norm": 0.9826157475331373, "learning_rate": 1.0411042228711254e-08, "loss": 0.1015, "step": 1704 }, { "epoch": 14.572649572649572, "grad_norm": 1.202253302780247, "learning_rate": 1.0007038696262517e-08, "loss": 0.4506, "step": 1705 }, { "epoch": 14.581196581196581, "grad_norm": 1.0946650709225376, "learning_rate": 9.611014072727354e-09, "loss": 0.246, "step": 1706 }, { "epoch": 14.58974358974359, "grad_norm": 1.0774018997500412, "learning_rate": 9.222969627123435e-09, "loss": 0.4581, "step": 1707 }, { "epoch": 14.598290598290598, "grad_norm": 1.3051886054710957, "learning_rate": 8.842906602896661e-09, "loss": 0.2653, "step": 1708 }, { "epoch": 14.606837606837606, "grad_norm": 0.9800272459228091, "learning_rate": 8.470826217917006e-09, "loss": 0.1108, "step": 1709 }, { "epoch": 14.615384615384615, "grad_norm": 0.9277483890590144, "learning_rate": 8.106729664475178e-09, "loss": 0.4336, "step": 1710 }, { "epoch": 14.623931623931623, "grad_norm": 1.0716060943444674, "learning_rate": 7.750618109278464e-09, "loss": 0.2783, "step": 1711 }, { "epoch": 14.632478632478632, "grad_norm": 0.9851690922805892, "learning_rate": 7.402492693447671e-09, "loss": 0.1646, "step": 1712 }, { "epoch": 14.64102564102564, "grad_norm": 1.491798773298588, "learning_rate": 7.062354532512416e-09, "loss": 0.5989, "step": 1713 }, { "epoch": 14.649572649572649, "grad_norm": 0.9900391674662283, "learning_rate": 6.730204716407507e-09, "loss": 0.3693, "step": 1714 }, { "epoch": 14.658119658119658, "grad_norm": 1.0409210475517114, "learning_rate": 6.406044309471004e-09, "loss": 0.1636, "step": 1715 }, { "epoch": 14.666666666666666, "grad_norm": 1.1496312917251512, "learning_rate": 6.089874350439507e-09, "loss": 0.3444, "step": 1716 }, { "epoch": 14.675213675213675, "grad_norm": 1.2119259909711284, "learning_rate": 5.781695852444258e-09, "loss": 0.4824, "step": 1717 }, { "epoch": 14.683760683760683, "grad_norm": 1.098314603031555, "learning_rate": 5.481509803009766e-09, "loss": 0.2783, "step": 1718 }, { "epoch": 14.692307692307692, "grad_norm": 0.7245471807790067, "learning_rate": 5.189317164049634e-09, "loss": 0.0919, "step": 1719 }, { "epoch": 14.7008547008547, "grad_norm": 1.09639322805763, "learning_rate": 4.905118871862402e-09, "loss": 0.3483, "step": 1720 }, { "epoch": 14.709401709401709, "grad_norm": 1.0581128651122704, "learning_rate": 4.62891583713071e-09, "loss": 0.09, "step": 1721 }, { "epoch": 14.717948717948717, "grad_norm": 1.0395913771941103, "learning_rate": 4.3607089449165806e-09, "loss": 0.3622, "step": 1722 }, { "epoch": 14.726495726495726, "grad_norm": 1.1486440562166906, "learning_rate": 4.100499054659757e-09, "loss": 0.2662, "step": 1723 }, { "epoch": 14.735042735042736, "grad_norm": 0.8951556891047434, "learning_rate": 3.848287000174089e-09, "loss": 0.3152, "step": 1724 }, { "epoch": 14.743589743589745, "grad_norm": 1.139371446672755, "learning_rate": 3.6040735896455957e-09, "loss": 0.273, "step": 1725 }, { "epoch": 14.752136752136753, "grad_norm": 0.8938981908344934, "learning_rate": 3.367859605628854e-09, "loss": 0.2162, "step": 1726 }, { "epoch": 14.760683760683762, "grad_norm": 1.1693798769445003, "learning_rate": 3.139645805046165e-09, "loss": 0.2869, "step": 1727 }, { "epoch": 14.76923076923077, "grad_norm": 0.9333554324995236, "learning_rate": 2.919432919183396e-09, "loss": 0.1761, "step": 1728 }, { "epoch": 14.777777777777779, "grad_norm": 1.24121704933367, "learning_rate": 2.7072216536885855e-09, "loss": 0.3746, "step": 1729 }, { "epoch": 14.786324786324787, "grad_norm": 1.1976434166923222, "learning_rate": 2.5030126885694505e-09, "loss": 0.4664, "step": 1730 }, { "epoch": 14.794871794871796, "grad_norm": 0.7888451308473411, "learning_rate": 2.3068066781908873e-09, "loss": 0.1526, "step": 1731 }, { "epoch": 14.803418803418804, "grad_norm": 1.0609440995043917, "learning_rate": 2.118604251273859e-09, "loss": 0.1695, "step": 1732 }, { "epoch": 14.811965811965813, "grad_norm": 1.1284468963364922, "learning_rate": 1.9384060108923463e-09, "loss": 0.4076, "step": 1733 }, { "epoch": 14.820512820512821, "grad_norm": 0.9200801478575613, "learning_rate": 1.766212534471401e-09, "loss": 0.1048, "step": 1734 }, { "epoch": 14.82905982905983, "grad_norm": 1.0748364878920569, "learning_rate": 1.6020243737865926e-09, "loss": 0.4537, "step": 1735 }, { "epoch": 14.837606837606838, "grad_norm": 1.111423242662358, "learning_rate": 1.4458420549606777e-09, "loss": 0.2354, "step": 1736 }, { "epoch": 14.846153846153847, "grad_norm": 0.9085030432997364, "learning_rate": 1.297666078462767e-09, "loss": 0.4356, "step": 1737 }, { "epoch": 14.854700854700855, "grad_norm": 0.7275980986093856, "learning_rate": 1.1574969191061047e-09, "loss": 0.1204, "step": 1738 }, { "epoch": 14.863247863247864, "grad_norm": 0.9569122217056513, "learning_rate": 1.0253350260480688e-09, "loss": 0.1361, "step": 1739 }, { "epoch": 14.871794871794872, "grad_norm": 1.1601832719859042, "learning_rate": 9.011808227865626e-10, "loss": 0.384, "step": 1740 }, { "epoch": 14.88034188034188, "grad_norm": 0.884213582013597, "learning_rate": 7.850347071597376e-10, "loss": 0.2421, "step": 1741 }, { "epoch": 14.88888888888889, "grad_norm": 0.8832450863042082, "learning_rate": 6.768970513457151e-10, "loss": 0.1231, "step": 1742 }, { "epoch": 14.897435897435898, "grad_norm": 1.3622457616085866, "learning_rate": 5.767682018595344e-10, "loss": 0.3513, "step": 1743 }, { "epoch": 14.905982905982906, "grad_norm": 1.055014911090151, "learning_rate": 4.846484795528739e-10, "loss": 0.2323, "step": 1744 }, { "epoch": 14.914529914529915, "grad_norm": 0.9330647451830272, "learning_rate": 4.0053817961321903e-10, "loss": 0.1243, "step": 1745 }, { "epoch": 14.923076923076923, "grad_norm": 0.9676723236179351, "learning_rate": 3.2443757156330746e-10, "loss": 0.2195, "step": 1746 }, { "epoch": 14.931623931623932, "grad_norm": 1.0828377737529897, "learning_rate": 2.563468992586304e-10, "loss": 0.3462, "step": 1747 }, { "epoch": 14.94017094017094, "grad_norm": 0.8574676515336324, "learning_rate": 1.9626638088854344e-10, "loss": 0.1641, "step": 1748 }, { "epoch": 14.948717948717949, "grad_norm": 0.9772627888768189, "learning_rate": 1.4419620897432318e-10, "loss": 0.1304, "step": 1749 }, { "epoch": 14.957264957264957, "grad_norm": 0.8578581155444094, "learning_rate": 1.0013655036916758e-10, "loss": 0.2403, "step": 1750 }, { "epoch": 14.965811965811966, "grad_norm": 0.9304603448605594, "learning_rate": 6.408754625736313e-11, "loss": 0.0621, "step": 1751 }, { "epoch": 14.974358974358974, "grad_norm": 1.1238292040246003, "learning_rate": 3.604931215400731e-11, "loss": 0.3074, "step": 1752 }, { "epoch": 14.982905982905983, "grad_norm": 1.1079431347000677, "learning_rate": 1.6021937904731054e-11, "loss": 0.3654, "step": 1753 }, { "epoch": 14.991452991452991, "grad_norm": 1.0356429914866991, "learning_rate": 4.005487684866083e-12, "loss": 0.2706, "step": 1754 }, { "epoch": 15.0, "grad_norm": 1.20860366920909, "learning_rate": 0.0, "loss": 0.3097, "step": 1755 }, { "epoch": 15.0, "step": 1755, "total_flos": 74086762168320.0, "train_loss": 0.44032181025060835, "train_runtime": 4826.2525, "train_samples_per_second": 2.539, "train_steps_per_second": 0.364 } ], "logging_steps": 1, "max_steps": 1755, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 74086762168320.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }