{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.739913097454997, "eval_steps": 500, "global_step": 700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012414649286157667, "grad_norm": 4.2440571784973145, "learning_rate": 4.999864283954701e-05, "loss": 3.2577, "num_input_tokens_seen": 37232, "step": 5 }, { "epoch": 0.024829298572315334, "grad_norm": 3.701024055480957, "learning_rate": 4.99931296277454e-05, "loss": 2.2677, "num_input_tokens_seen": 73136, "step": 10 }, { "epoch": 0.037243947858473, "grad_norm": 1.1576040983200073, "learning_rate": 4.998337647663173e-05, "loss": 1.1294, "num_input_tokens_seen": 107680, "step": 15 }, { "epoch": 0.04965859714463067, "grad_norm": 0.9847909808158875, "learning_rate": 4.9969385040771445e-05, "loss": 0.6708, "num_input_tokens_seen": 142864, "step": 20 }, { "epoch": 0.06207324643078833, "grad_norm": 1.115911841392517, "learning_rate": 4.99511576937304e-05, "loss": 0.7132, "num_input_tokens_seen": 182192, "step": 25 }, { "epoch": 0.074487895716946, "grad_norm": 0.7898509502410889, "learning_rate": 4.992869752767218e-05, "loss": 0.5369, "num_input_tokens_seen": 214848, "step": 30 }, { "epoch": 0.08690254500310367, "grad_norm": 0.7674874067306519, "learning_rate": 4.990200835283353e-05, "loss": 0.7403, "num_input_tokens_seen": 246032, "step": 35 }, { "epoch": 0.09931719428926133, "grad_norm": 0.7663428783416748, "learning_rate": 4.9871094696877995e-05, "loss": 0.6055, "num_input_tokens_seen": 285488, "step": 40 }, { "epoch": 0.11173184357541899, "grad_norm": 0.8450624942779541, "learning_rate": 4.983596180412778e-05, "loss": 0.5864, "num_input_tokens_seen": 321376, "step": 45 }, { "epoch": 0.12414649286157665, "grad_norm": 0.851875364780426, "learning_rate": 4.9796615634674155e-05, "loss": 0.657, "num_input_tokens_seen": 362464, "step": 50 }, { "epoch": 0.13656114214773432, "grad_norm": 1.1010278463363647, "learning_rate": 4.9753062863366276e-05, "loss": 0.6528, "num_input_tokens_seen": 401136, "step": 55 }, { "epoch": 0.148975791433892, "grad_norm": 0.9413073062896729, "learning_rate": 4.97053108786789e-05, "loss": 0.5494, "num_input_tokens_seen": 439184, "step": 60 }, { "epoch": 0.16139044072004965, "grad_norm": 1.122977614402771, "learning_rate": 4.965336778145895e-05, "loss": 0.5638, "num_input_tokens_seen": 481472, "step": 65 }, { "epoch": 0.17380509000620734, "grad_norm": 0.8132630586624146, "learning_rate": 4.959724238355123e-05, "loss": 0.6434, "num_input_tokens_seen": 516224, "step": 70 }, { "epoch": 0.186219739292365, "grad_norm": 0.6980988383293152, "learning_rate": 4.953694420630361e-05, "loss": 0.6215, "num_input_tokens_seen": 552096, "step": 75 }, { "epoch": 0.19863438857852267, "grad_norm": 0.9601867198944092, "learning_rate": 4.947248347895172e-05, "loss": 0.4621, "num_input_tokens_seen": 586336, "step": 80 }, { "epoch": 0.21104903786468032, "grad_norm": 0.5580036640167236, "learning_rate": 4.940387113688363e-05, "loss": 0.4003, "num_input_tokens_seen": 621744, "step": 85 }, { "epoch": 0.22346368715083798, "grad_norm": 0.8909018635749817, "learning_rate": 4.9331118819784773e-05, "loss": 0.4919, "num_input_tokens_seen": 660800, "step": 90 }, { "epoch": 0.23587833643699566, "grad_norm": 0.9431052207946777, "learning_rate": 4.925423886966328e-05, "loss": 0.3287, "num_input_tokens_seen": 699120, "step": 95 }, { "epoch": 0.2482929857231533, "grad_norm": 0.761715292930603, "learning_rate": 4.917324432875627e-05, "loss": 0.3702, "num_input_tokens_seen": 734592, "step": 100 }, { "epoch": 0.260707635009311, "grad_norm": 0.8547595143318176, "learning_rate": 4.908814893731728e-05, "loss": 0.6426, "num_input_tokens_seen": 767808, "step": 105 }, { "epoch": 0.27312228429546864, "grad_norm": 0.5668878555297852, "learning_rate": 4.8998967131285356e-05, "loss": 0.3547, "num_input_tokens_seen": 808944, "step": 110 }, { "epoch": 0.2855369335816263, "grad_norm": 1.570143699645996, "learning_rate": 4.890571403983603e-05, "loss": 0.6513, "num_input_tokens_seen": 843824, "step": 115 }, { "epoch": 0.297951582867784, "grad_norm": 1.070682406425476, "learning_rate": 4.880840548281475e-05, "loss": 0.4612, "num_input_tokens_seen": 880896, "step": 120 }, { "epoch": 0.31036623215394166, "grad_norm": 1.1252045631408691, "learning_rate": 4.8707057968053175e-05, "loss": 0.4602, "num_input_tokens_seen": 920016, "step": 125 }, { "epoch": 0.3227808814400993, "grad_norm": 0.8112793564796448, "learning_rate": 4.8601688688568695e-05, "loss": 0.4327, "num_input_tokens_seen": 954688, "step": 130 }, { "epoch": 0.33519553072625696, "grad_norm": 1.0317658185958862, "learning_rate": 4.849231551964771e-05, "loss": 0.5646, "num_input_tokens_seen": 993088, "step": 135 }, { "epoch": 0.34761018001241467, "grad_norm": 1.267600655555725, "learning_rate": 4.8378957015813225e-05, "loss": 0.485, "num_input_tokens_seen": 1032992, "step": 140 }, { "epoch": 0.3600248292985723, "grad_norm": 0.811553955078125, "learning_rate": 4.8261632407677174e-05, "loss": 0.3628, "num_input_tokens_seen": 1069440, "step": 145 }, { "epoch": 0.37243947858473, "grad_norm": 1.0823220014572144, "learning_rate": 4.814036159867803e-05, "loss": 0.4573, "num_input_tokens_seen": 1106464, "step": 150 }, { "epoch": 0.38485412787088763, "grad_norm": 1.234602689743042, "learning_rate": 4.8015165161704375e-05, "loss": 0.4317, "num_input_tokens_seen": 1140432, "step": 155 }, { "epoch": 0.39726877715704534, "grad_norm": 1.0161981582641602, "learning_rate": 4.788606433560474e-05, "loss": 0.4032, "num_input_tokens_seen": 1172608, "step": 160 }, { "epoch": 0.409683426443203, "grad_norm": 1.0053682327270508, "learning_rate": 4.775308102158461e-05, "loss": 0.427, "num_input_tokens_seen": 1206800, "step": 165 }, { "epoch": 0.42209807572936064, "grad_norm": 1.164704442024231, "learning_rate": 4.761623777949102e-05, "loss": 0.3341, "num_input_tokens_seen": 1246208, "step": 170 }, { "epoch": 0.4345127250155183, "grad_norm": 1.2743600606918335, "learning_rate": 4.747555782398537e-05, "loss": 0.4546, "num_input_tokens_seen": 1288384, "step": 175 }, { "epoch": 0.44692737430167595, "grad_norm": 1.1493018865585327, "learning_rate": 4.7331065020605204e-05, "loss": 0.3587, "num_input_tokens_seen": 1323152, "step": 180 }, { "epoch": 0.45934202358783366, "grad_norm": 1.6683332920074463, "learning_rate": 4.71827838817156e-05, "loss": 0.3688, "num_input_tokens_seen": 1362528, "step": 185 }, { "epoch": 0.4717566728739913, "grad_norm": 1.0220803022384644, "learning_rate": 4.7030739562350713e-05, "loss": 0.3573, "num_input_tokens_seen": 1397920, "step": 190 }, { "epoch": 0.48417132216014896, "grad_norm": 1.4865689277648926, "learning_rate": 4.6874957855946455e-05, "loss": 0.3767, "num_input_tokens_seen": 1436896, "step": 195 }, { "epoch": 0.4965859714463066, "grad_norm": 1.2456483840942383, "learning_rate": 4.6715465189964724e-05, "loss": 0.2812, "num_input_tokens_seen": 1471152, "step": 200 }, { "epoch": 0.5090006207324643, "grad_norm": 1.8806102275848389, "learning_rate": 4.655228862141017e-05, "loss": 0.4907, "num_input_tokens_seen": 1505184, "step": 205 }, { "epoch": 0.521415270018622, "grad_norm": 1.122801423072815, "learning_rate": 4.638545583224011e-05, "loss": 0.3227, "num_input_tokens_seen": 1542048, "step": 210 }, { "epoch": 0.5338299193047796, "grad_norm": 1.8667101860046387, "learning_rate": 4.621499512466847e-05, "loss": 0.3973, "num_input_tokens_seen": 1581072, "step": 215 }, { "epoch": 0.5462445685909373, "grad_norm": 1.7394258975982666, "learning_rate": 4.604093541636447e-05, "loss": 0.3103, "num_input_tokens_seen": 1617216, "step": 220 }, { "epoch": 0.5586592178770949, "grad_norm": 1.7145013809204102, "learning_rate": 4.586330623554691e-05, "loss": 0.295, "num_input_tokens_seen": 1654720, "step": 225 }, { "epoch": 0.5710738671632526, "grad_norm": 1.7859894037246704, "learning_rate": 4.5682137715974835e-05, "loss": 0.4494, "num_input_tokens_seen": 1690112, "step": 230 }, { "epoch": 0.5834885164494104, "grad_norm": 1.4137821197509766, "learning_rate": 4.5497460591835615e-05, "loss": 0.3657, "num_input_tokens_seen": 1726288, "step": 235 }, { "epoch": 0.595903165735568, "grad_norm": 1.8285645246505737, "learning_rate": 4.530930619253097e-05, "loss": 0.3456, "num_input_tokens_seen": 1765376, "step": 240 }, { "epoch": 0.6083178150217257, "grad_norm": 1.5978659391403198, "learning_rate": 4.5117706437362176e-05, "loss": 0.2501, "num_input_tokens_seen": 1799248, "step": 245 }, { "epoch": 0.6207324643078833, "grad_norm": 2.195641279220581, "learning_rate": 4.492269383011512e-05, "loss": 0.349, "num_input_tokens_seen": 1836928, "step": 250 }, { "epoch": 0.633147113594041, "grad_norm": 2.489474296569824, "learning_rate": 4.472430145354622e-05, "loss": 0.3266, "num_input_tokens_seen": 1874848, "step": 255 }, { "epoch": 0.6455617628801986, "grad_norm": 1.9080901145935059, "learning_rate": 4.452256296377017e-05, "loss": 0.3149, "num_input_tokens_seen": 1911216, "step": 260 }, { "epoch": 0.6579764121663563, "grad_norm": 1.6012259721755981, "learning_rate": 4.431751258455029e-05, "loss": 0.2729, "num_input_tokens_seen": 1946304, "step": 265 }, { "epoch": 0.6703910614525139, "grad_norm": 1.1568175554275513, "learning_rate": 4.4109185101492735e-05, "loss": 0.1484, "num_input_tokens_seen": 1981616, "step": 270 }, { "epoch": 0.6828057107386716, "grad_norm": 2.3041481971740723, "learning_rate": 4.38976158561453e-05, "loss": 0.309, "num_input_tokens_seen": 2025136, "step": 275 }, { "epoch": 0.6952203600248293, "grad_norm": 2.112865924835205, "learning_rate": 4.368284074000193e-05, "loss": 0.3137, "num_input_tokens_seen": 2061600, "step": 280 }, { "epoch": 0.707635009310987, "grad_norm": 2.451075792312622, "learning_rate": 4.346489618841393e-05, "loss": 0.2555, "num_input_tokens_seen": 2097392, "step": 285 }, { "epoch": 0.7200496585971446, "grad_norm": 1.9181469678878784, "learning_rate": 4.324381917440891e-05, "loss": 0.2131, "num_input_tokens_seen": 2134016, "step": 290 }, { "epoch": 0.7324643078833023, "grad_norm": 1.8552194833755493, "learning_rate": 4.3019647202418566e-05, "loss": 0.1754, "num_input_tokens_seen": 2173824, "step": 295 }, { "epoch": 0.74487895716946, "grad_norm": 2.7781271934509277, "learning_rate": 4.2792418301916224e-05, "loss": 0.1768, "num_input_tokens_seen": 2206256, "step": 300 }, { "epoch": 0.7572936064556176, "grad_norm": 3.7296838760375977, "learning_rate": 4.25621710209654e-05, "loss": 0.1965, "num_input_tokens_seen": 2240240, "step": 305 }, { "epoch": 0.7697082557417753, "grad_norm": 1.6878883838653564, "learning_rate": 4.23289444196803e-05, "loss": 0.2395, "num_input_tokens_seen": 2275648, "step": 310 }, { "epoch": 0.7821229050279329, "grad_norm": 1.6181930303573608, "learning_rate": 4.2092778063599555e-05, "loss": 0.2325, "num_input_tokens_seen": 2308336, "step": 315 }, { "epoch": 0.7945375543140907, "grad_norm": 2.1921615600585938, "learning_rate": 4.18537120169741e-05, "loss": 0.1968, "num_input_tokens_seen": 2343568, "step": 320 }, { "epoch": 0.8069522036002483, "grad_norm": 2.2584996223449707, "learning_rate": 4.161178683597054e-05, "loss": 0.1644, "num_input_tokens_seen": 2379472, "step": 325 }, { "epoch": 0.819366852886406, "grad_norm": 2.3837616443634033, "learning_rate": 4.1367043561791055e-05, "loss": 0.1877, "num_input_tokens_seen": 2419312, "step": 330 }, { "epoch": 0.8317815021725636, "grad_norm": 1.6383507251739502, "learning_rate": 4.1119523713710904e-05, "loss": 0.1101, "num_input_tokens_seen": 2453632, "step": 335 }, { "epoch": 0.8441961514587213, "grad_norm": 2.8826770782470703, "learning_rate": 4.0869269282035057e-05, "loss": 0.19, "num_input_tokens_seen": 2486976, "step": 340 }, { "epoch": 0.8566108007448789, "grad_norm": 1.9499619007110596, "learning_rate": 4.0616322720974664e-05, "loss": 0.1826, "num_input_tokens_seen": 2525936, "step": 345 }, { "epoch": 0.8690254500310366, "grad_norm": 2.0627527236938477, "learning_rate": 4.036072694144501e-05, "loss": 0.1074, "num_input_tokens_seen": 2564032, "step": 350 }, { "epoch": 0.8814400993171942, "grad_norm": 2.31044864654541, "learning_rate": 4.010252530378589e-05, "loss": 0.1336, "num_input_tokens_seen": 2599856, "step": 355 }, { "epoch": 0.8938547486033519, "grad_norm": 1.834965467453003, "learning_rate": 3.9841761610405845e-05, "loss": 0.092, "num_input_tokens_seen": 2639152, "step": 360 }, { "epoch": 0.9062693978895097, "grad_norm": 2.362949848175049, "learning_rate": 3.9578480098351244e-05, "loss": 0.1203, "num_input_tokens_seen": 2676240, "step": 365 }, { "epoch": 0.9186840471756673, "grad_norm": 3.110170841217041, "learning_rate": 3.93127254318018e-05, "loss": 0.1209, "num_input_tokens_seen": 2715968, "step": 370 }, { "epoch": 0.931098696461825, "grad_norm": 2.367720127105713, "learning_rate": 3.904454269449351e-05, "loss": 0.1817, "num_input_tokens_seen": 2756304, "step": 375 }, { "epoch": 0.9435133457479826, "grad_norm": 3.0095152854919434, "learning_rate": 3.87739773820705e-05, "loss": 0.1251, "num_input_tokens_seen": 2792416, "step": 380 }, { "epoch": 0.9559279950341403, "grad_norm": 1.9829216003417969, "learning_rate": 3.850107539436689e-05, "loss": 0.1037, "num_input_tokens_seen": 2828864, "step": 385 }, { "epoch": 0.9683426443202979, "grad_norm": 2.005993127822876, "learning_rate": 3.822588302762024e-05, "loss": 0.1396, "num_input_tokens_seen": 2871152, "step": 390 }, { "epoch": 0.9807572936064556, "grad_norm": 2.376305341720581, "learning_rate": 3.794844696661757e-05, "loss": 0.1243, "num_input_tokens_seen": 2907664, "step": 395 }, { "epoch": 0.9931719428926132, "grad_norm": 2.518366575241089, "learning_rate": 3.766881427677563e-05, "loss": 0.1032, "num_input_tokens_seen": 2939920, "step": 400 }, { "epoch": 1.0074487895716946, "grad_norm": 1.6247838735580444, "learning_rate": 3.73870323961565e-05, "loss": 0.0686, "num_input_tokens_seen": 2980064, "step": 405 }, { "epoch": 1.0198634388578522, "grad_norm": 1.916497826576233, "learning_rate": 3.710314912741997e-05, "loss": 0.0539, "num_input_tokens_seen": 3018608, "step": 410 }, { "epoch": 1.03227808814401, "grad_norm": 2.025754690170288, "learning_rate": 3.681721262971413e-05, "loss": 0.0622, "num_input_tokens_seen": 3057088, "step": 415 }, { "epoch": 1.0446927374301676, "grad_norm": 2.7112205028533936, "learning_rate": 3.652927141050548e-05, "loss": 0.0546, "num_input_tokens_seen": 3095968, "step": 420 }, { "epoch": 1.0571073867163252, "grad_norm": 2.102372884750366, "learning_rate": 3.623937431734982e-05, "loss": 0.0328, "num_input_tokens_seen": 3132352, "step": 425 }, { "epoch": 1.0695220360024829, "grad_norm": 1.3384644985198975, "learning_rate": 3.594757052960566e-05, "loss": 0.0403, "num_input_tokens_seen": 3169168, "step": 430 }, { "epoch": 1.0819366852886405, "grad_norm": 1.744563341140747, "learning_rate": 3.565390955009113e-05, "loss": 0.0415, "num_input_tokens_seen": 3205520, "step": 435 }, { "epoch": 1.0943513345747982, "grad_norm": 2.653256416320801, "learning_rate": 3.535844119668622e-05, "loss": 0.0405, "num_input_tokens_seen": 3241840, "step": 440 }, { "epoch": 1.106765983860956, "grad_norm": 1.7136708498001099, "learning_rate": 3.5061215593881345e-05, "loss": 0.0256, "num_input_tokens_seen": 3280336, "step": 445 }, { "epoch": 1.1191806331471137, "grad_norm": 1.9461147785186768, "learning_rate": 3.47622831642741e-05, "loss": 0.0438, "num_input_tokens_seen": 3315936, "step": 450 }, { "epoch": 1.1315952824332713, "grad_norm": 2.365481376647949, "learning_rate": 3.446169462001534e-05, "loss": 0.0432, "num_input_tokens_seen": 3353392, "step": 455 }, { "epoch": 1.144009931719429, "grad_norm": 2.80092716217041, "learning_rate": 3.415950095420616e-05, "loss": 0.0544, "num_input_tokens_seen": 3392928, "step": 460 }, { "epoch": 1.1564245810055866, "grad_norm": 1.879797101020813, "learning_rate": 3.385575343224718e-05, "loss": 0.0429, "num_input_tokens_seen": 3431088, "step": 465 }, { "epoch": 1.1688392302917443, "grad_norm": 2.7898995876312256, "learning_rate": 3.355050358314172e-05, "loss": 0.0389, "num_input_tokens_seen": 3463952, "step": 470 }, { "epoch": 1.181253879577902, "grad_norm": 2.2746829986572266, "learning_rate": 3.324380319075416e-05, "loss": 0.0417, "num_input_tokens_seen": 3503584, "step": 475 }, { "epoch": 1.1936685288640596, "grad_norm": 2.055482864379883, "learning_rate": 3.293570428502515e-05, "loss": 0.0588, "num_input_tokens_seen": 3534784, "step": 480 }, { "epoch": 1.2060831781502173, "grad_norm": 2.486926317214966, "learning_rate": 3.262625913314496e-05, "loss": 0.04, "num_input_tokens_seen": 3573136, "step": 485 }, { "epoch": 1.218497827436375, "grad_norm": 0.873005211353302, "learning_rate": 3.231552023068675e-05, "loss": 0.0321, "num_input_tokens_seen": 3612416, "step": 490 }, { "epoch": 1.2309124767225326, "grad_norm": 1.0674567222595215, "learning_rate": 3.200354029270091e-05, "loss": 0.026, "num_input_tokens_seen": 3651600, "step": 495 }, { "epoch": 1.2433271260086902, "grad_norm": 1.9875820875167847, "learning_rate": 3.1690372244772356e-05, "loss": 0.0289, "num_input_tokens_seen": 3687072, "step": 500 }, { "epoch": 1.2557417752948479, "grad_norm": 2.648414373397827, "learning_rate": 3.1376069214041913e-05, "loss": 0.0282, "num_input_tokens_seen": 3719376, "step": 505 }, { "epoch": 1.2681564245810055, "grad_norm": 1.0216617584228516, "learning_rate": 3.106068452019365e-05, "loss": 0.0513, "num_input_tokens_seen": 3754512, "step": 510 }, { "epoch": 1.2805710738671632, "grad_norm": 0.45367100834846497, "learning_rate": 3.0744271666409524e-05, "loss": 0.0215, "num_input_tokens_seen": 3790640, "step": 515 }, { "epoch": 1.292985723153321, "grad_norm": 1.669716238975525, "learning_rate": 3.0426884330292842e-05, "loss": 0.0265, "num_input_tokens_seen": 3826256, "step": 520 }, { "epoch": 1.3054003724394785, "grad_norm": 3.11692214012146, "learning_rate": 3.0108576354762175e-05, "loss": 0.0641, "num_input_tokens_seen": 3870096, "step": 525 }, { "epoch": 1.3178150217256364, "grad_norm": 1.151227355003357, "learning_rate": 2.9789401738917244e-05, "loss": 0.023, "num_input_tokens_seen": 3903392, "step": 530 }, { "epoch": 1.3302296710117938, "grad_norm": 1.889338493347168, "learning_rate": 2.946941462887824e-05, "loss": 0.0271, "num_input_tokens_seen": 3940016, "step": 535 }, { "epoch": 1.3426443202979517, "grad_norm": 1.8156658411026, "learning_rate": 2.9148669308600296e-05, "loss": 0.0238, "num_input_tokens_seen": 3977488, "step": 540 }, { "epoch": 1.3550589695841093, "grad_norm": 0.7474733591079712, "learning_rate": 2.8827220190664506e-05, "loss": 0.0147, "num_input_tokens_seen": 4015136, "step": 545 }, { "epoch": 1.367473618870267, "grad_norm": 0.6636483073234558, "learning_rate": 2.850512180704715e-05, "loss": 0.0162, "num_input_tokens_seen": 4050320, "step": 550 }, { "epoch": 1.3798882681564246, "grad_norm": 2.2164974212646484, "learning_rate": 2.8182428799868645e-05, "loss": 0.0189, "num_input_tokens_seen": 4088480, "step": 555 }, { "epoch": 1.3923029174425823, "grad_norm": 1.5055993795394897, "learning_rate": 2.7859195912123874e-05, "loss": 0.0331, "num_input_tokens_seen": 4121808, "step": 560 }, { "epoch": 1.40471756672874, "grad_norm": 2.1504476070404053, "learning_rate": 2.7535477978395297e-05, "loss": 0.0323, "num_input_tokens_seen": 4157184, "step": 565 }, { "epoch": 1.4171322160148976, "grad_norm": 1.7122963666915894, "learning_rate": 2.7211329915550615e-05, "loss": 0.0251, "num_input_tokens_seen": 4193696, "step": 570 }, { "epoch": 1.4295468653010552, "grad_norm": 2.430009365081787, "learning_rate": 2.6886806713426434e-05, "loss": 0.0216, "num_input_tokens_seen": 4229296, "step": 575 }, { "epoch": 1.441961514587213, "grad_norm": 0.5678420662879944, "learning_rate": 2.6561963425499574e-05, "loss": 0.0156, "num_input_tokens_seen": 4266064, "step": 580 }, { "epoch": 1.4543761638733705, "grad_norm": 0.7321358323097229, "learning_rate": 2.6236855159547525e-05, "loss": 0.0131, "num_input_tokens_seen": 4303968, "step": 585 }, { "epoch": 1.4667908131595282, "grad_norm": 0.22348253428936005, "learning_rate": 2.5911537068299802e-05, "loss": 0.0268, "num_input_tokens_seen": 4342112, "step": 590 }, { "epoch": 1.4792054624456858, "grad_norm": 2.1055712699890137, "learning_rate": 2.5586064340081516e-05, "loss": 0.0283, "num_input_tokens_seen": 4382192, "step": 595 }, { "epoch": 1.4916201117318435, "grad_norm": 0.6284292340278625, "learning_rate": 2.5260492189451073e-05, "loss": 0.0156, "num_input_tokens_seen": 4417312, "step": 600 }, { "epoch": 1.5040347610180014, "grad_norm": 0.9831182956695557, "learning_rate": 2.4934875847833308e-05, "loss": 0.0259, "num_input_tokens_seen": 4451088, "step": 605 }, { "epoch": 1.5164494103041588, "grad_norm": 2.135622262954712, "learning_rate": 2.460927055414981e-05, "loss": 0.0194, "num_input_tokens_seen": 4483216, "step": 610 }, { "epoch": 1.5288640595903167, "grad_norm": 0.9603184461593628, "learning_rate": 2.428373154544791e-05, "loss": 0.0193, "num_input_tokens_seen": 4522112, "step": 615 }, { "epoch": 1.5412787088764741, "grad_norm": 2.2298717498779297, "learning_rate": 2.3958314047530125e-05, "loss": 0.0274, "num_input_tokens_seen": 4557680, "step": 620 }, { "epoch": 1.553693358162632, "grad_norm": 1.421598196029663, "learning_rate": 2.3633073265585356e-05, "loss": 0.0207, "num_input_tokens_seen": 4598688, "step": 625 }, { "epoch": 1.5661080074487894, "grad_norm": 1.4904828071594238, "learning_rate": 2.330806437482365e-05, "loss": 0.015, "num_input_tokens_seen": 4634416, "step": 630 }, { "epoch": 1.5785226567349473, "grad_norm": 1.2914338111877441, "learning_rate": 2.298334251111607e-05, "loss": 0.0146, "num_input_tokens_seen": 4674752, "step": 635 }, { "epoch": 1.590937306021105, "grad_norm": 1.376469373703003, "learning_rate": 2.2658962761641232e-05, "loss": 0.0101, "num_input_tokens_seen": 4711168, "step": 640 }, { "epoch": 1.6033519553072626, "grad_norm": 3.4528443813323975, "learning_rate": 2.233498015554002e-05, "loss": 0.0254, "num_input_tokens_seen": 4744512, "step": 645 }, { "epoch": 1.6157666045934203, "grad_norm": 1.9136216640472412, "learning_rate": 2.2011449654580266e-05, "loss": 0.0174, "num_input_tokens_seen": 4783280, "step": 650 }, { "epoch": 1.628181253879578, "grad_norm": 0.9676538705825806, "learning_rate": 2.1688426143832803e-05, "loss": 0.0091, "num_input_tokens_seen": 4819312, "step": 655 }, { "epoch": 1.6405959031657356, "grad_norm": 1.1898787021636963, "learning_rate": 2.1365964422360497e-05, "loss": 0.0066, "num_input_tokens_seen": 4858640, "step": 660 }, { "epoch": 1.6530105524518932, "grad_norm": 1.0632059574127197, "learning_rate": 2.104411919392193e-05, "loss": 0.0119, "num_input_tokens_seen": 4898464, "step": 665 }, { "epoch": 1.6654252017380509, "grad_norm": 0.5189204216003418, "learning_rate": 2.0722945057691252e-05, "loss": 0.0125, "num_input_tokens_seen": 4942720, "step": 670 }, { "epoch": 1.6778398510242085, "grad_norm": 1.324246883392334, "learning_rate": 2.0402496498995667e-05, "loss": 0.0105, "num_input_tokens_seen": 4978512, "step": 675 }, { "epoch": 1.6902545003103664, "grad_norm": 1.2258532047271729, "learning_rate": 2.008282788007239e-05, "loss": 0.0085, "num_input_tokens_seen": 5016080, "step": 680 }, { "epoch": 1.7026691495965238, "grad_norm": 3.3063275814056396, "learning_rate": 1.9763993430846395e-05, "loss": 0.0194, "num_input_tokens_seen": 5052112, "step": 685 }, { "epoch": 1.7150837988826817, "grad_norm": 1.9474592208862305, "learning_rate": 1.944604723973058e-05, "loss": 0.0157, "num_input_tokens_seen": 5087808, "step": 690 }, { "epoch": 1.7274984481688391, "grad_norm": 1.0510306358337402, "learning_rate": 1.9129043244450026e-05, "loss": 0.0095, "num_input_tokens_seen": 5118928, "step": 695 }, { "epoch": 1.739913097454997, "grad_norm": 0.6354652643203735, "learning_rate": 1.8813035222891784e-05, "loss": 0.0052, "num_input_tokens_seen": 5151792, "step": 700 } ], "logging_steps": 5, "max_steps": 1206, "num_input_tokens_seen": 5151792, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.1918209084861645e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }