{ "best_global_step": 16968, "best_metric": 0.15088850259780884, "best_model_checkpoint": "saves_multiple/ia3/llama-3-8b-instruct/train_boolq_123_1762628463/checkpoint-16968", "epoch": 20.0, "eval_steps": 2121, "global_step": 42420, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0023573785950023575, "grad_norm": 6.023632049560547, "learning_rate": 4.714757190004715e-08, "loss": 0.6057, "num_input_tokens_seen": 5184, "step": 5 }, { "epoch": 0.004714757190004715, "grad_norm": 2.6282451152801514, "learning_rate": 1.0608203677510608e-07, "loss": 0.5862, "num_input_tokens_seen": 9600, "step": 10 }, { "epoch": 0.007072135785007072, "grad_norm": 0.8247906565666199, "learning_rate": 1.6501650165016504e-07, "loss": 0.3688, "num_input_tokens_seen": 14336, "step": 15 }, { "epoch": 0.00942951438000943, "grad_norm": 2.8065528869628906, "learning_rate": 2.2395096652522396e-07, "loss": 0.3121, "num_input_tokens_seen": 19680, "step": 20 }, { "epoch": 0.011786892975011787, "grad_norm": 3.1807472705841064, "learning_rate": 2.828854314002829e-07, "loss": 0.9506, "num_input_tokens_seen": 24576, "step": 25 }, { "epoch": 0.014144271570014143, "grad_norm": 3.406327724456787, "learning_rate": 3.4181989627534184e-07, "loss": 0.342, "num_input_tokens_seen": 29216, "step": 30 }, { "epoch": 0.0165016501650165, "grad_norm": 0.8785660862922668, "learning_rate": 4.007543611504008e-07, "loss": 0.119, "num_input_tokens_seen": 33440, "step": 35 }, { "epoch": 0.01885902876001886, "grad_norm": 3.8353400230407715, "learning_rate": 4.5968882602545973e-07, "loss": 0.5443, "num_input_tokens_seen": 38240, "step": 40 }, { "epoch": 0.021216407355021217, "grad_norm": 2.328681468963623, "learning_rate": 5.186232909005186e-07, "loss": 0.3394, "num_input_tokens_seen": 43296, "step": 45 }, { "epoch": 0.023573785950023574, "grad_norm": 0.7230870127677917, "learning_rate": 5.775577557755775e-07, "loss": 0.1801, "num_input_tokens_seen": 47776, "step": 50 }, { "epoch": 0.02593116454502593, "grad_norm": 3.5831546783447266, "learning_rate": 6.364922206506365e-07, "loss": 0.6421, "num_input_tokens_seen": 52544, "step": 55 }, { "epoch": 0.028288543140028287, "grad_norm": 1.0082216262817383, "learning_rate": 6.954266855256955e-07, "loss": 0.3972, "num_input_tokens_seen": 58176, "step": 60 }, { "epoch": 0.030645921735030647, "grad_norm": 5.761929512023926, "learning_rate": 7.543611504007544e-07, "loss": 0.7584, "num_input_tokens_seen": 62912, "step": 65 }, { "epoch": 0.033003300330033, "grad_norm": 2.6351330280303955, "learning_rate": 8.132956152758133e-07, "loss": 0.6401, "num_input_tokens_seen": 68992, "step": 70 }, { "epoch": 0.03536067892503536, "grad_norm": 2.9537434577941895, "learning_rate": 8.722300801508723e-07, "loss": 0.8128, "num_input_tokens_seen": 73312, "step": 75 }, { "epoch": 0.03771805752003772, "grad_norm": 2.8937716484069824, "learning_rate": 9.311645450259312e-07, "loss": 0.6709, "num_input_tokens_seen": 78976, "step": 80 }, { "epoch": 0.040075436115040074, "grad_norm": 4.1982245445251465, "learning_rate": 9.900990099009902e-07, "loss": 0.4635, "num_input_tokens_seen": 83712, "step": 85 }, { "epoch": 0.042432814710042434, "grad_norm": 3.5949859619140625, "learning_rate": 1.0490334747760492e-06, "loss": 0.3784, "num_input_tokens_seen": 87936, "step": 90 }, { "epoch": 0.04479019330504479, "grad_norm": 0.5521332621574402, "learning_rate": 1.107967939651108e-06, "loss": 0.2236, "num_input_tokens_seen": 92704, "step": 95 }, { "epoch": 0.04714757190004715, "grad_norm": 1.1381092071533203, "learning_rate": 1.166902404526167e-06, "loss": 0.1863, "num_input_tokens_seen": 97664, "step": 100 }, { "epoch": 0.04950495049504951, "grad_norm": 0.886139452457428, "learning_rate": 1.2258368694012257e-06, "loss": 0.1942, "num_input_tokens_seen": 102240, "step": 105 }, { "epoch": 0.05186232909005186, "grad_norm": 4.2133965492248535, "learning_rate": 1.284771334276285e-06, "loss": 0.2198, "num_input_tokens_seen": 107296, "step": 110 }, { "epoch": 0.05421970768505422, "grad_norm": 1.8306502103805542, "learning_rate": 1.343705799151344e-06, "loss": 0.3155, "num_input_tokens_seen": 112992, "step": 115 }, { "epoch": 0.056577086280056574, "grad_norm": 1.4466181993484497, "learning_rate": 1.4026402640264027e-06, "loss": 0.3416, "num_input_tokens_seen": 118464, "step": 120 }, { "epoch": 0.058934464875058934, "grad_norm": 1.096137523651123, "learning_rate": 1.4615747289014617e-06, "loss": 0.4174, "num_input_tokens_seen": 124608, "step": 125 }, { "epoch": 0.061291843470061294, "grad_norm": 2.9522602558135986, "learning_rate": 1.5205091937765205e-06, "loss": 0.4644, "num_input_tokens_seen": 131008, "step": 130 }, { "epoch": 0.06364922206506365, "grad_norm": 1.082009196281433, "learning_rate": 1.5794436586515793e-06, "loss": 0.2333, "num_input_tokens_seen": 136480, "step": 135 }, { "epoch": 0.066006600660066, "grad_norm": 4.063689708709717, "learning_rate": 1.6383781235266383e-06, "loss": 0.6784, "num_input_tokens_seen": 141280, "step": 140 }, { "epoch": 0.06836397925506836, "grad_norm": 2.448277711868286, "learning_rate": 1.6973125884016973e-06, "loss": 0.6606, "num_input_tokens_seen": 146400, "step": 145 }, { "epoch": 0.07072135785007072, "grad_norm": 1.4777203798294067, "learning_rate": 1.7562470532767563e-06, "loss": 0.4081, "num_input_tokens_seen": 151392, "step": 150 }, { "epoch": 0.07307873644507308, "grad_norm": 3.2903647422790527, "learning_rate": 1.8151815181518153e-06, "loss": 0.4494, "num_input_tokens_seen": 156768, "step": 155 }, { "epoch": 0.07543611504007544, "grad_norm": 0.8123233914375305, "learning_rate": 1.874115983026874e-06, "loss": 0.2098, "num_input_tokens_seen": 161216, "step": 160 }, { "epoch": 0.07779349363507779, "grad_norm": 1.5158545970916748, "learning_rate": 1.933050447901933e-06, "loss": 0.3315, "num_input_tokens_seen": 166528, "step": 165 }, { "epoch": 0.08015087223008015, "grad_norm": 4.008279323577881, "learning_rate": 1.991984912776992e-06, "loss": 0.3994, "num_input_tokens_seen": 172064, "step": 170 }, { "epoch": 0.08250825082508251, "grad_norm": 3.761793375015259, "learning_rate": 2.050919377652051e-06, "loss": 0.4331, "num_input_tokens_seen": 177280, "step": 175 }, { "epoch": 0.08486562942008487, "grad_norm": 5.94821834564209, "learning_rate": 2.1098538425271103e-06, "loss": 0.5134, "num_input_tokens_seen": 181440, "step": 180 }, { "epoch": 0.08722300801508723, "grad_norm": 3.609553337097168, "learning_rate": 2.1687883074021686e-06, "loss": 0.4929, "num_input_tokens_seen": 186496, "step": 185 }, { "epoch": 0.08958038661008957, "grad_norm": 2.893794059753418, "learning_rate": 2.227722772277228e-06, "loss": 0.213, "num_input_tokens_seen": 190688, "step": 190 }, { "epoch": 0.09193776520509193, "grad_norm": 0.47728046774864197, "learning_rate": 2.2866572371522866e-06, "loss": 0.3682, "num_input_tokens_seen": 195648, "step": 195 }, { "epoch": 0.0942951438000943, "grad_norm": 5.4001898765563965, "learning_rate": 2.345591702027346e-06, "loss": 0.3624, "num_input_tokens_seen": 200512, "step": 200 }, { "epoch": 0.09665252239509665, "grad_norm": 4.271119117736816, "learning_rate": 2.4045261669024046e-06, "loss": 0.7829, "num_input_tokens_seen": 205856, "step": 205 }, { "epoch": 0.09900990099009901, "grad_norm": 2.856412649154663, "learning_rate": 2.463460631777464e-06, "loss": 0.5748, "num_input_tokens_seen": 210784, "step": 210 }, { "epoch": 0.10136727958510136, "grad_norm": 5.62005090713501, "learning_rate": 2.5223950966525226e-06, "loss": 0.4732, "num_input_tokens_seen": 215360, "step": 215 }, { "epoch": 0.10372465818010372, "grad_norm": 3.191505193710327, "learning_rate": 2.5813295615275814e-06, "loss": 0.7254, "num_input_tokens_seen": 220064, "step": 220 }, { "epoch": 0.10608203677510608, "grad_norm": 4.153656959533691, "learning_rate": 2.6402640264026406e-06, "loss": 0.5303, "num_input_tokens_seen": 224352, "step": 225 }, { "epoch": 0.10843941537010844, "grad_norm": 1.1340796947479248, "learning_rate": 2.6991984912776994e-06, "loss": 0.4151, "num_input_tokens_seen": 229088, "step": 230 }, { "epoch": 0.1107967939651108, "grad_norm": 1.0990736484527588, "learning_rate": 2.7581329561527586e-06, "loss": 0.3585, "num_input_tokens_seen": 233824, "step": 235 }, { "epoch": 0.11315417256011315, "grad_norm": 3.451202630996704, "learning_rate": 2.817067421027817e-06, "loss": 0.6327, "num_input_tokens_seen": 238816, "step": 240 }, { "epoch": 0.11551155115511551, "grad_norm": 4.442129611968994, "learning_rate": 2.876001885902876e-06, "loss": 0.6112, "num_input_tokens_seen": 243328, "step": 245 }, { "epoch": 0.11786892975011787, "grad_norm": 3.454606056213379, "learning_rate": 2.934936350777935e-06, "loss": 0.34, "num_input_tokens_seen": 248896, "step": 250 }, { "epoch": 0.12022630834512023, "grad_norm": 2.782458782196045, "learning_rate": 2.993870815652994e-06, "loss": 0.2976, "num_input_tokens_seen": 253440, "step": 255 }, { "epoch": 0.12258368694012259, "grad_norm": 1.8441119194030762, "learning_rate": 3.052805280528053e-06, "loss": 0.4665, "num_input_tokens_seen": 258208, "step": 260 }, { "epoch": 0.12494106553512493, "grad_norm": 0.8024095296859741, "learning_rate": 3.1117397454031117e-06, "loss": 0.1089, "num_input_tokens_seen": 262592, "step": 265 }, { "epoch": 0.1272984441301273, "grad_norm": 4.145676136016846, "learning_rate": 3.170674210278171e-06, "loss": 0.526, "num_input_tokens_seen": 267648, "step": 270 }, { "epoch": 0.12965582272512965, "grad_norm": 0.9986189007759094, "learning_rate": 3.2296086751532297e-06, "loss": 0.8014, "num_input_tokens_seen": 272384, "step": 275 }, { "epoch": 0.132013201320132, "grad_norm": 0.8228076696395874, "learning_rate": 3.2885431400282885e-06, "loss": 0.3956, "num_input_tokens_seen": 278464, "step": 280 }, { "epoch": 0.13437057991513437, "grad_norm": 2.335969924926758, "learning_rate": 3.3474776049033477e-06, "loss": 0.2679, "num_input_tokens_seen": 282432, "step": 285 }, { "epoch": 0.13672795851013672, "grad_norm": 1.7018508911132812, "learning_rate": 3.4064120697784065e-06, "loss": 0.4537, "num_input_tokens_seen": 287744, "step": 290 }, { "epoch": 0.1390853371051391, "grad_norm": 2.1275150775909424, "learning_rate": 3.4653465346534657e-06, "loss": 0.5024, "num_input_tokens_seen": 291840, "step": 295 }, { "epoch": 0.14144271570014144, "grad_norm": 0.955101728439331, "learning_rate": 3.5242809995285245e-06, "loss": 0.5184, "num_input_tokens_seen": 296928, "step": 300 }, { "epoch": 0.1438000942951438, "grad_norm": 3.054905414581299, "learning_rate": 3.5832154644035833e-06, "loss": 0.3396, "num_input_tokens_seen": 302272, "step": 305 }, { "epoch": 0.14615747289014616, "grad_norm": 4.346579551696777, "learning_rate": 3.6421499292786425e-06, "loss": 0.5679, "num_input_tokens_seen": 306688, "step": 310 }, { "epoch": 0.1485148514851485, "grad_norm": 5.197269916534424, "learning_rate": 3.7010843941537013e-06, "loss": 0.7053, "num_input_tokens_seen": 310528, "step": 315 }, { "epoch": 0.15087223008015088, "grad_norm": 4.9452223777771, "learning_rate": 3.7600188590287605e-06, "loss": 0.4407, "num_input_tokens_seen": 314912, "step": 320 }, { "epoch": 0.15322960867515323, "grad_norm": 0.9456391930580139, "learning_rate": 3.818953323903819e-06, "loss": 0.2513, "num_input_tokens_seen": 319712, "step": 325 }, { "epoch": 0.15558698727015557, "grad_norm": 3.6347923278808594, "learning_rate": 3.877887788778878e-06, "loss": 0.7585, "num_input_tokens_seen": 324704, "step": 330 }, { "epoch": 0.15794436586515795, "grad_norm": 0.9717179536819458, "learning_rate": 3.936822253653937e-06, "loss": 0.4442, "num_input_tokens_seen": 330112, "step": 335 }, { "epoch": 0.1603017444601603, "grad_norm": 5.50918436050415, "learning_rate": 3.995756718528996e-06, "loss": 0.592, "num_input_tokens_seen": 337984, "step": 340 }, { "epoch": 0.16265912305516267, "grad_norm": 0.5864297151565552, "learning_rate": 4.054691183404055e-06, "loss": 0.6602, "num_input_tokens_seen": 343136, "step": 345 }, { "epoch": 0.16501650165016502, "grad_norm": 0.6455245018005371, "learning_rate": 4.113625648279114e-06, "loss": 0.4025, "num_input_tokens_seen": 348992, "step": 350 }, { "epoch": 0.16737388024516736, "grad_norm": 3.512477397918701, "learning_rate": 4.172560113154173e-06, "loss": 0.6274, "num_input_tokens_seen": 353504, "step": 355 }, { "epoch": 0.16973125884016974, "grad_norm": 4.662522792816162, "learning_rate": 4.231494578029232e-06, "loss": 0.4134, "num_input_tokens_seen": 359104, "step": 360 }, { "epoch": 0.17208863743517208, "grad_norm": 0.8516752123832703, "learning_rate": 4.29042904290429e-06, "loss": 0.3022, "num_input_tokens_seen": 363744, "step": 365 }, { "epoch": 0.17444601603017446, "grad_norm": 3.403691530227661, "learning_rate": 4.34936350777935e-06, "loss": 0.6321, "num_input_tokens_seen": 368864, "step": 370 }, { "epoch": 0.1768033946251768, "grad_norm": 2.7946510314941406, "learning_rate": 4.408297972654409e-06, "loss": 0.4408, "num_input_tokens_seen": 373824, "step": 375 }, { "epoch": 0.17916077322017915, "grad_norm": 4.422415733337402, "learning_rate": 4.467232437529468e-06, "loss": 0.4899, "num_input_tokens_seen": 378432, "step": 380 }, { "epoch": 0.18151815181518152, "grad_norm": 1.6771291494369507, "learning_rate": 4.526166902404526e-06, "loss": 0.3939, "num_input_tokens_seen": 383552, "step": 385 }, { "epoch": 0.18387553041018387, "grad_norm": 4.456026077270508, "learning_rate": 4.585101367279585e-06, "loss": 0.5403, "num_input_tokens_seen": 388768, "step": 390 }, { "epoch": 0.18623290900518624, "grad_norm": 0.9074841737747192, "learning_rate": 4.644035832154645e-06, "loss": 0.2605, "num_input_tokens_seen": 393120, "step": 395 }, { "epoch": 0.1885902876001886, "grad_norm": 1.193577766418457, "learning_rate": 4.702970297029704e-06, "loss": 0.2928, "num_input_tokens_seen": 397856, "step": 400 }, { "epoch": 0.19094766619519093, "grad_norm": 4.033779144287109, "learning_rate": 4.7619047619047615e-06, "loss": 0.5416, "num_input_tokens_seen": 402848, "step": 405 }, { "epoch": 0.1933050447901933, "grad_norm": 4.615461349487305, "learning_rate": 4.820839226779821e-06, "loss": 0.4452, "num_input_tokens_seen": 407936, "step": 410 }, { "epoch": 0.19566242338519566, "grad_norm": 4.403698444366455, "learning_rate": 4.87977369165488e-06, "loss": 0.5347, "num_input_tokens_seen": 413024, "step": 415 }, { "epoch": 0.19801980198019803, "grad_norm": 6.370095252990723, "learning_rate": 4.93870815652994e-06, "loss": 0.6823, "num_input_tokens_seen": 418720, "step": 420 }, { "epoch": 0.20037718057520038, "grad_norm": 6.372138023376465, "learning_rate": 4.9976426214049975e-06, "loss": 0.4375, "num_input_tokens_seen": 425440, "step": 425 }, { "epoch": 0.20273455917020272, "grad_norm": 3.53932785987854, "learning_rate": 5.056577086280056e-06, "loss": 0.796, "num_input_tokens_seen": 430464, "step": 430 }, { "epoch": 0.2050919377652051, "grad_norm": 4.669775009155273, "learning_rate": 5.115511551155116e-06, "loss": 0.5042, "num_input_tokens_seen": 435776, "step": 435 }, { "epoch": 0.20744931636020744, "grad_norm": 0.8513526916503906, "learning_rate": 5.174446016030175e-06, "loss": 0.3152, "num_input_tokens_seen": 440352, "step": 440 }, { "epoch": 0.20980669495520982, "grad_norm": 2.3816378116607666, "learning_rate": 5.2333804809052335e-06, "loss": 0.4815, "num_input_tokens_seen": 444768, "step": 445 }, { "epoch": 0.21216407355021216, "grad_norm": 6.771730899810791, "learning_rate": 5.292314945780292e-06, "loss": 0.6835, "num_input_tokens_seen": 450528, "step": 450 }, { "epoch": 0.2145214521452145, "grad_norm": 10.187238693237305, "learning_rate": 5.351249410655351e-06, "loss": 0.969, "num_input_tokens_seen": 454976, "step": 455 }, { "epoch": 0.21687883074021688, "grad_norm": 2.5289974212646484, "learning_rate": 5.410183875530411e-06, "loss": 0.9386, "num_input_tokens_seen": 459072, "step": 460 }, { "epoch": 0.21923620933521923, "grad_norm": 1.4604519605636597, "learning_rate": 5.4691183404054695e-06, "loss": 0.4296, "num_input_tokens_seen": 464320, "step": 465 }, { "epoch": 0.2215935879302216, "grad_norm": 0.6402031183242798, "learning_rate": 5.528052805280528e-06, "loss": 0.2475, "num_input_tokens_seen": 469120, "step": 470 }, { "epoch": 0.22395096652522395, "grad_norm": 3.196258783340454, "learning_rate": 5.586987270155587e-06, "loss": 0.6134, "num_input_tokens_seen": 473920, "step": 475 }, { "epoch": 0.2263083451202263, "grad_norm": 2.958310604095459, "learning_rate": 5.645921735030646e-06, "loss": 0.7439, "num_input_tokens_seen": 477920, "step": 480 }, { "epoch": 0.22866572371522867, "grad_norm": 1.3323396444320679, "learning_rate": 5.7048561999057055e-06, "loss": 0.639, "num_input_tokens_seen": 483040, "step": 485 }, { "epoch": 0.23102310231023102, "grad_norm": 3.1617796421051025, "learning_rate": 5.763790664780764e-06, "loss": 0.8233, "num_input_tokens_seen": 487872, "step": 490 }, { "epoch": 0.2333804809052334, "grad_norm": 1.5926963090896606, "learning_rate": 5.822725129655823e-06, "loss": 0.287, "num_input_tokens_seen": 492960, "step": 495 }, { "epoch": 0.23573785950023574, "grad_norm": 3.5086159706115723, "learning_rate": 5.881659594530882e-06, "loss": 0.1795, "num_input_tokens_seen": 498400, "step": 500 }, { "epoch": 0.23809523809523808, "grad_norm": 1.3558942079544067, "learning_rate": 5.940594059405941e-06, "loss": 0.27, "num_input_tokens_seen": 502784, "step": 505 }, { "epoch": 0.24045261669024046, "grad_norm": 7.574243545532227, "learning_rate": 5.999528524281e-06, "loss": 0.4226, "num_input_tokens_seen": 507776, "step": 510 }, { "epoch": 0.2428099952852428, "grad_norm": 1.0512218475341797, "learning_rate": 6.058462989156059e-06, "loss": 0.2738, "num_input_tokens_seen": 512480, "step": 515 }, { "epoch": 0.24516737388024518, "grad_norm": 1.9688383340835571, "learning_rate": 6.117397454031118e-06, "loss": 0.4753, "num_input_tokens_seen": 517600, "step": 520 }, { "epoch": 0.24752475247524752, "grad_norm": 2.752411365509033, "learning_rate": 6.176331918906177e-06, "loss": 0.5113, "num_input_tokens_seen": 523520, "step": 525 }, { "epoch": 0.24988213107024987, "grad_norm": 1.8146591186523438, "learning_rate": 6.235266383781235e-06, "loss": 0.321, "num_input_tokens_seen": 527584, "step": 530 }, { "epoch": 0.2522395096652522, "grad_norm": 3.301400899887085, "learning_rate": 6.294200848656294e-06, "loss": 0.621, "num_input_tokens_seen": 532736, "step": 535 }, { "epoch": 0.2545968882602546, "grad_norm": 1.3475465774536133, "learning_rate": 6.353135313531354e-06, "loss": 0.6177, "num_input_tokens_seen": 537440, "step": 540 }, { "epoch": 0.25695426685525696, "grad_norm": 1.0077059268951416, "learning_rate": 6.412069778406413e-06, "loss": 0.4219, "num_input_tokens_seen": 543040, "step": 545 }, { "epoch": 0.2593116454502593, "grad_norm": 6.692164421081543, "learning_rate": 6.471004243281471e-06, "loss": 0.7427, "num_input_tokens_seen": 547520, "step": 550 }, { "epoch": 0.26166902404526166, "grad_norm": 2.7395851612091064, "learning_rate": 6.52993870815653e-06, "loss": 0.2734, "num_input_tokens_seen": 552800, "step": 555 }, { "epoch": 0.264026402640264, "grad_norm": 0.9757192730903625, "learning_rate": 6.588873173031589e-06, "loss": 0.097, "num_input_tokens_seen": 557376, "step": 560 }, { "epoch": 0.2663837812352664, "grad_norm": 0.7048436999320984, "learning_rate": 6.647807637906649e-06, "loss": 0.2308, "num_input_tokens_seen": 563776, "step": 565 }, { "epoch": 0.26874115983026875, "grad_norm": 2.6663897037506104, "learning_rate": 6.706742102781707e-06, "loss": 0.5513, "num_input_tokens_seen": 567712, "step": 570 }, { "epoch": 0.2710985384252711, "grad_norm": 1.5665146112442017, "learning_rate": 6.765676567656766e-06, "loss": 0.4007, "num_input_tokens_seen": 572992, "step": 575 }, { "epoch": 0.27345591702027344, "grad_norm": 1.8097643852233887, "learning_rate": 6.824611032531825e-06, "loss": 0.3548, "num_input_tokens_seen": 578816, "step": 580 }, { "epoch": 0.2758132956152758, "grad_norm": 5.303888320922852, "learning_rate": 6.883545497406884e-06, "loss": 0.4592, "num_input_tokens_seen": 583552, "step": 585 }, { "epoch": 0.2781706742102782, "grad_norm": 2.8989832401275635, "learning_rate": 6.942479962281943e-06, "loss": 0.3921, "num_input_tokens_seen": 588864, "step": 590 }, { "epoch": 0.28052805280528054, "grad_norm": 1.3845725059509277, "learning_rate": 7.001414427157002e-06, "loss": 0.294, "num_input_tokens_seen": 593248, "step": 595 }, { "epoch": 0.2828854314002829, "grad_norm": 2.5521039962768555, "learning_rate": 7.060348892032061e-06, "loss": 0.4681, "num_input_tokens_seen": 599360, "step": 600 }, { "epoch": 0.28524280999528523, "grad_norm": 2.554302215576172, "learning_rate": 7.11928335690712e-06, "loss": 0.3951, "num_input_tokens_seen": 604352, "step": 605 }, { "epoch": 0.2876001885902876, "grad_norm": 3.0146210193634033, "learning_rate": 7.178217821782178e-06, "loss": 0.3012, "num_input_tokens_seen": 609952, "step": 610 }, { "epoch": 0.28995756718529, "grad_norm": 1.9889626502990723, "learning_rate": 7.237152286657238e-06, "loss": 0.2812, "num_input_tokens_seen": 614400, "step": 615 }, { "epoch": 0.2923149457802923, "grad_norm": 5.2602643966674805, "learning_rate": 7.296086751532297e-06, "loss": 0.5168, "num_input_tokens_seen": 619552, "step": 620 }, { "epoch": 0.29467232437529467, "grad_norm": 4.058675289154053, "learning_rate": 7.355021216407356e-06, "loss": 0.4996, "num_input_tokens_seen": 624960, "step": 625 }, { "epoch": 0.297029702970297, "grad_norm": 0.9031217694282532, "learning_rate": 7.413955681282414e-06, "loss": 0.5305, "num_input_tokens_seen": 631520, "step": 630 }, { "epoch": 0.29938708156529936, "grad_norm": 0.7623670101165771, "learning_rate": 7.472890146157472e-06, "loss": 0.5969, "num_input_tokens_seen": 637568, "step": 635 }, { "epoch": 0.30174446016030176, "grad_norm": 1.303468108177185, "learning_rate": 7.531824611032533e-06, "loss": 0.3699, "num_input_tokens_seen": 642688, "step": 640 }, { "epoch": 0.3041018387553041, "grad_norm": 0.8503625988960266, "learning_rate": 7.590759075907592e-06, "loss": 0.2283, "num_input_tokens_seen": 647392, "step": 645 }, { "epoch": 0.30645921735030646, "grad_norm": 2.915661096572876, "learning_rate": 7.64969354078265e-06, "loss": 0.4409, "num_input_tokens_seen": 652800, "step": 650 }, { "epoch": 0.3088165959453088, "grad_norm": 3.9335012435913086, "learning_rate": 7.708628005657708e-06, "loss": 0.3304, "num_input_tokens_seen": 659328, "step": 655 }, { "epoch": 0.31117397454031115, "grad_norm": 5.0903096199035645, "learning_rate": 7.767562470532768e-06, "loss": 0.2569, "num_input_tokens_seen": 663488, "step": 660 }, { "epoch": 0.31353135313531355, "grad_norm": 0.42957407236099243, "learning_rate": 7.826496935407828e-06, "loss": 0.1859, "num_input_tokens_seen": 668192, "step": 665 }, { "epoch": 0.3158887317303159, "grad_norm": 4.472097873687744, "learning_rate": 7.885431400282886e-06, "loss": 0.7881, "num_input_tokens_seen": 673568, "step": 670 }, { "epoch": 0.31824611032531824, "grad_norm": 2.449472188949585, "learning_rate": 7.944365865157945e-06, "loss": 0.3943, "num_input_tokens_seen": 681056, "step": 675 }, { "epoch": 0.3206034889203206, "grad_norm": 4.2645721435546875, "learning_rate": 8.003300330033003e-06, "loss": 0.3098, "num_input_tokens_seen": 685920, "step": 680 }, { "epoch": 0.32296086751532294, "grad_norm": 0.5958676338195801, "learning_rate": 8.062234794908063e-06, "loss": 0.3183, "num_input_tokens_seen": 690816, "step": 685 }, { "epoch": 0.32531824611032534, "grad_norm": 4.825544834136963, "learning_rate": 8.121169259783122e-06, "loss": 0.2954, "num_input_tokens_seen": 695648, "step": 690 }, { "epoch": 0.3276756247053277, "grad_norm": 0.7372010350227356, "learning_rate": 8.18010372465818e-06, "loss": 0.0966, "num_input_tokens_seen": 700960, "step": 695 }, { "epoch": 0.33003300330033003, "grad_norm": 0.9890338182449341, "learning_rate": 8.23903818953324e-06, "loss": 0.119, "num_input_tokens_seen": 705856, "step": 700 }, { "epoch": 0.3323903818953324, "grad_norm": 6.173374176025391, "learning_rate": 8.297972654408298e-06, "loss": 0.2717, "num_input_tokens_seen": 710272, "step": 705 }, { "epoch": 0.3347477604903347, "grad_norm": 0.7982988357543945, "learning_rate": 8.356907119283356e-06, "loss": 0.4655, "num_input_tokens_seen": 714816, "step": 710 }, { "epoch": 0.3371051390853371, "grad_norm": 3.656996965408325, "learning_rate": 8.415841584158417e-06, "loss": 1.053, "num_input_tokens_seen": 720736, "step": 715 }, { "epoch": 0.33946251768033947, "grad_norm": 3.0586791038513184, "learning_rate": 8.474776049033475e-06, "loss": 0.2614, "num_input_tokens_seen": 726848, "step": 720 }, { "epoch": 0.3418198962753418, "grad_norm": 2.8888871669769287, "learning_rate": 8.533710513908535e-06, "loss": 0.6954, "num_input_tokens_seen": 732512, "step": 725 }, { "epoch": 0.34417727487034416, "grad_norm": 2.461846113204956, "learning_rate": 8.592644978783593e-06, "loss": 0.3064, "num_input_tokens_seen": 737952, "step": 730 }, { "epoch": 0.3465346534653465, "grad_norm": 0.8807897567749023, "learning_rate": 8.65157944365865e-06, "loss": 0.2503, "num_input_tokens_seen": 742752, "step": 735 }, { "epoch": 0.3488920320603489, "grad_norm": 3.6302378177642822, "learning_rate": 8.710513908533712e-06, "loss": 0.9087, "num_input_tokens_seen": 747968, "step": 740 }, { "epoch": 0.35124941065535126, "grad_norm": 0.2844088673591614, "learning_rate": 8.76944837340877e-06, "loss": 0.4773, "num_input_tokens_seen": 753632, "step": 745 }, { "epoch": 0.3536067892503536, "grad_norm": 0.7725861668586731, "learning_rate": 8.82838283828383e-06, "loss": 0.3417, "num_input_tokens_seen": 759904, "step": 750 }, { "epoch": 0.35596416784535595, "grad_norm": 3.8490662574768066, "learning_rate": 8.887317303158887e-06, "loss": 0.8336, "num_input_tokens_seen": 764960, "step": 755 }, { "epoch": 0.3583215464403583, "grad_norm": 0.40261492133140564, "learning_rate": 8.946251768033945e-06, "loss": 0.3557, "num_input_tokens_seen": 769184, "step": 760 }, { "epoch": 0.3606789250353607, "grad_norm": 2.650007486343384, "learning_rate": 9.005186232909007e-06, "loss": 0.3932, "num_input_tokens_seen": 775104, "step": 765 }, { "epoch": 0.36303630363036304, "grad_norm": 1.966752052307129, "learning_rate": 9.064120697784065e-06, "loss": 0.2737, "num_input_tokens_seen": 779744, "step": 770 }, { "epoch": 0.3653936822253654, "grad_norm": 0.23261098563671112, "learning_rate": 9.123055162659123e-06, "loss": 0.1985, "num_input_tokens_seen": 784832, "step": 775 }, { "epoch": 0.36775106082036774, "grad_norm": 0.997441291809082, "learning_rate": 9.181989627534182e-06, "loss": 0.2983, "num_input_tokens_seen": 789920, "step": 780 }, { "epoch": 0.3701084394153701, "grad_norm": 5.383679389953613, "learning_rate": 9.24092409240924e-06, "loss": 0.1321, "num_input_tokens_seen": 795168, "step": 785 }, { "epoch": 0.3724658180103725, "grad_norm": 0.19777031242847443, "learning_rate": 9.299858557284302e-06, "loss": 0.2687, "num_input_tokens_seen": 800096, "step": 790 }, { "epoch": 0.37482319660537483, "grad_norm": 3.753209352493286, "learning_rate": 9.35879302215936e-06, "loss": 0.4568, "num_input_tokens_seen": 805632, "step": 795 }, { "epoch": 0.3771805752003772, "grad_norm": 2.187546968460083, "learning_rate": 9.417727487034417e-06, "loss": 0.4557, "num_input_tokens_seen": 810688, "step": 800 }, { "epoch": 0.3795379537953795, "grad_norm": 0.4781167507171631, "learning_rate": 9.476661951909477e-06, "loss": 0.4632, "num_input_tokens_seen": 815264, "step": 805 }, { "epoch": 0.38189533239038187, "grad_norm": 0.09818488359451294, "learning_rate": 9.535596416784535e-06, "loss": 0.3664, "num_input_tokens_seen": 820320, "step": 810 }, { "epoch": 0.38425271098538427, "grad_norm": 0.17353084683418274, "learning_rate": 9.594530881659596e-06, "loss": 0.5474, "num_input_tokens_seen": 824064, "step": 815 }, { "epoch": 0.3866100895803866, "grad_norm": 0.6542636752128601, "learning_rate": 9.653465346534654e-06, "loss": 0.1674, "num_input_tokens_seen": 829120, "step": 820 }, { "epoch": 0.38896746817538896, "grad_norm": 2.190786361694336, "learning_rate": 9.712399811409712e-06, "loss": 0.4126, "num_input_tokens_seen": 834816, "step": 825 }, { "epoch": 0.3913248467703913, "grad_norm": 1.8419913053512573, "learning_rate": 9.771334276284772e-06, "loss": 0.2481, "num_input_tokens_seen": 840256, "step": 830 }, { "epoch": 0.39368222536539366, "grad_norm": 2.32023024559021, "learning_rate": 9.83026874115983e-06, "loss": 0.1093, "num_input_tokens_seen": 844864, "step": 835 }, { "epoch": 0.39603960396039606, "grad_norm": 2.2073915004730225, "learning_rate": 9.88920320603489e-06, "loss": 0.2687, "num_input_tokens_seen": 850304, "step": 840 }, { "epoch": 0.3983969825553984, "grad_norm": 1.5857843160629272, "learning_rate": 9.948137670909949e-06, "loss": 0.085, "num_input_tokens_seen": 855232, "step": 845 }, { "epoch": 0.40075436115040075, "grad_norm": 0.21277180314064026, "learning_rate": 1.0007072135785007e-05, "loss": 0.4871, "num_input_tokens_seen": 860128, "step": 850 }, { "epoch": 0.4031117397454031, "grad_norm": 4.8298258781433105, "learning_rate": 1.0066006600660067e-05, "loss": 0.4733, "num_input_tokens_seen": 864896, "step": 855 }, { "epoch": 0.40546911834040544, "grad_norm": 2.2795357704162598, "learning_rate": 1.0124941065535125e-05, "loss": 0.4257, "num_input_tokens_seen": 869408, "step": 860 }, { "epoch": 0.40782649693540785, "grad_norm": 3.71614670753479, "learning_rate": 1.0183875530410184e-05, "loss": 0.2301, "num_input_tokens_seen": 874144, "step": 865 }, { "epoch": 0.4101838755304102, "grad_norm": 1.0762547254562378, "learning_rate": 1.0242809995285244e-05, "loss": 0.5145, "num_input_tokens_seen": 879040, "step": 870 }, { "epoch": 0.41254125412541254, "grad_norm": 0.23501023650169373, "learning_rate": 1.0301744460160302e-05, "loss": 0.3263, "num_input_tokens_seen": 883712, "step": 875 }, { "epoch": 0.4148986327204149, "grad_norm": 0.9386695623397827, "learning_rate": 1.0360678925035361e-05, "loss": 0.3429, "num_input_tokens_seen": 888128, "step": 880 }, { "epoch": 0.41725601131541723, "grad_norm": 0.9013268947601318, "learning_rate": 1.041961338991042e-05, "loss": 0.1917, "num_input_tokens_seen": 894240, "step": 885 }, { "epoch": 0.41961338991041963, "grad_norm": 1.4185774326324463, "learning_rate": 1.0478547854785479e-05, "loss": 0.1971, "num_input_tokens_seen": 899456, "step": 890 }, { "epoch": 0.421970768505422, "grad_norm": 0.3551771640777588, "learning_rate": 1.0537482319660539e-05, "loss": 0.1898, "num_input_tokens_seen": 904672, "step": 895 }, { "epoch": 0.4243281471004243, "grad_norm": 1.4853852987289429, "learning_rate": 1.0596416784535597e-05, "loss": 0.2383, "num_input_tokens_seen": 913216, "step": 900 }, { "epoch": 0.42668552569542667, "grad_norm": 4.0058441162109375, "learning_rate": 1.0655351249410656e-05, "loss": 0.4947, "num_input_tokens_seen": 917664, "step": 905 }, { "epoch": 0.429042904290429, "grad_norm": 4.701272964477539, "learning_rate": 1.0714285714285714e-05, "loss": 0.2127, "num_input_tokens_seen": 923360, "step": 910 }, { "epoch": 0.4314002828854314, "grad_norm": 2.2257115840911865, "learning_rate": 1.0773220179160774e-05, "loss": 0.1959, "num_input_tokens_seen": 928608, "step": 915 }, { "epoch": 0.43375766148043376, "grad_norm": 0.935390830039978, "learning_rate": 1.0832154644035833e-05, "loss": 0.15, "num_input_tokens_seen": 933312, "step": 920 }, { "epoch": 0.4361150400754361, "grad_norm": 0.08392957597970963, "learning_rate": 1.0891089108910891e-05, "loss": 0.3495, "num_input_tokens_seen": 937824, "step": 925 }, { "epoch": 0.43847241867043846, "grad_norm": 3.2670369148254395, "learning_rate": 1.0950023573785951e-05, "loss": 0.4271, "num_input_tokens_seen": 942016, "step": 930 }, { "epoch": 0.4408297972654408, "grad_norm": 0.30096861720085144, "learning_rate": 1.1008958038661009e-05, "loss": 0.3949, "num_input_tokens_seen": 946080, "step": 935 }, { "epoch": 0.4431871758604432, "grad_norm": 0.09650272876024246, "learning_rate": 1.1067892503536068e-05, "loss": 0.0851, "num_input_tokens_seen": 951168, "step": 940 }, { "epoch": 0.44554455445544555, "grad_norm": 1.0181081295013428, "learning_rate": 1.1126826968411128e-05, "loss": 0.2705, "num_input_tokens_seen": 956992, "step": 945 }, { "epoch": 0.4479019330504479, "grad_norm": 0.9703948497772217, "learning_rate": 1.1185761433286186e-05, "loss": 0.1497, "num_input_tokens_seen": 961248, "step": 950 }, { "epoch": 0.45025931164545024, "grad_norm": 2.2186994552612305, "learning_rate": 1.1244695898161246e-05, "loss": 0.3383, "num_input_tokens_seen": 965376, "step": 955 }, { "epoch": 0.4526166902404526, "grad_norm": 2.6064095497131348, "learning_rate": 1.1303630363036304e-05, "loss": 0.4261, "num_input_tokens_seen": 969664, "step": 960 }, { "epoch": 0.454974068835455, "grad_norm": 1.1518181562423706, "learning_rate": 1.1362564827911363e-05, "loss": 0.1665, "num_input_tokens_seen": 974016, "step": 965 }, { "epoch": 0.45733144743045734, "grad_norm": 2.1984262466430664, "learning_rate": 1.1421499292786423e-05, "loss": 0.5085, "num_input_tokens_seen": 979072, "step": 970 }, { "epoch": 0.4596888260254597, "grad_norm": 2.890876531600952, "learning_rate": 1.148043375766148e-05, "loss": 0.2772, "num_input_tokens_seen": 984032, "step": 975 }, { "epoch": 0.46204620462046203, "grad_norm": 2.8101344108581543, "learning_rate": 1.1539368222536539e-05, "loss": 0.0745, "num_input_tokens_seen": 988544, "step": 980 }, { "epoch": 0.4644035832154644, "grad_norm": 8.66305160522461, "learning_rate": 1.1598302687411598e-05, "loss": 0.3092, "num_input_tokens_seen": 993504, "step": 985 }, { "epoch": 0.4667609618104668, "grad_norm": 1.525733470916748, "learning_rate": 1.1657237152286658e-05, "loss": 0.246, "num_input_tokens_seen": 999104, "step": 990 }, { "epoch": 0.4691183404054691, "grad_norm": 0.2251185029745102, "learning_rate": 1.1716171617161718e-05, "loss": 0.2047, "num_input_tokens_seen": 1004864, "step": 995 }, { "epoch": 0.47147571900047147, "grad_norm": 3.34869122505188, "learning_rate": 1.1775106082036776e-05, "loss": 0.1107, "num_input_tokens_seen": 1009472, "step": 1000 }, { "epoch": 0.4738330975954738, "grad_norm": 1.1395840644836426, "learning_rate": 1.1834040546911834e-05, "loss": 0.1808, "num_input_tokens_seen": 1013536, "step": 1005 }, { "epoch": 0.47619047619047616, "grad_norm": 0.7985941767692566, "learning_rate": 1.1892975011786893e-05, "loss": 0.1219, "num_input_tokens_seen": 1018144, "step": 1010 }, { "epoch": 0.47854785478547857, "grad_norm": 0.15081119537353516, "learning_rate": 1.1951909476661953e-05, "loss": 0.4522, "num_input_tokens_seen": 1022848, "step": 1015 }, { "epoch": 0.4809052333804809, "grad_norm": 3.365469217300415, "learning_rate": 1.2010843941537012e-05, "loss": 0.2205, "num_input_tokens_seen": 1027424, "step": 1020 }, { "epoch": 0.48326261197548326, "grad_norm": 2.8321282863616943, "learning_rate": 1.206977840641207e-05, "loss": 0.3865, "num_input_tokens_seen": 1032768, "step": 1025 }, { "epoch": 0.4856199905704856, "grad_norm": 1.922838807106018, "learning_rate": 1.2128712871287128e-05, "loss": 0.0864, "num_input_tokens_seen": 1037440, "step": 1030 }, { "epoch": 0.48797736916548795, "grad_norm": 0.6764984726905823, "learning_rate": 1.2187647336162188e-05, "loss": 0.1539, "num_input_tokens_seen": 1042400, "step": 1035 }, { "epoch": 0.49033474776049035, "grad_norm": 2.9376866817474365, "learning_rate": 1.2246581801037248e-05, "loss": 0.2106, "num_input_tokens_seen": 1048032, "step": 1040 }, { "epoch": 0.4926921263554927, "grad_norm": 2.800457715988159, "learning_rate": 1.2305516265912306e-05, "loss": 0.1902, "num_input_tokens_seen": 1053408, "step": 1045 }, { "epoch": 0.49504950495049505, "grad_norm": 2.1707890033721924, "learning_rate": 1.2364450730787365e-05, "loss": 0.2207, "num_input_tokens_seen": 1059008, "step": 1050 }, { "epoch": 0.4974068835454974, "grad_norm": 1.3683396577835083, "learning_rate": 1.2423385195662423e-05, "loss": 0.2487, "num_input_tokens_seen": 1063712, "step": 1055 }, { "epoch": 0.49976426214049974, "grad_norm": 6.342047214508057, "learning_rate": 1.2482319660537483e-05, "loss": 0.833, "num_input_tokens_seen": 1068320, "step": 1060 }, { "epoch": 0.5021216407355021, "grad_norm": 1.7139934301376343, "learning_rate": 1.254125412541254e-05, "loss": 0.1598, "num_input_tokens_seen": 1073280, "step": 1065 }, { "epoch": 0.5044790193305044, "grad_norm": 1.0844107866287231, "learning_rate": 1.26001885902876e-05, "loss": 0.2317, "num_input_tokens_seen": 1077184, "step": 1070 }, { "epoch": 0.5068363979255068, "grad_norm": 0.049143433570861816, "learning_rate": 1.2659123055162662e-05, "loss": 0.3032, "num_input_tokens_seen": 1081664, "step": 1075 }, { "epoch": 0.5091937765205092, "grad_norm": 0.13430076837539673, "learning_rate": 1.2718057520037718e-05, "loss": 0.2338, "num_input_tokens_seen": 1086304, "step": 1080 }, { "epoch": 0.5115511551155115, "grad_norm": 0.06299097090959549, "learning_rate": 1.277699198491278e-05, "loss": 0.2001, "num_input_tokens_seen": 1091616, "step": 1085 }, { "epoch": 0.5139085337105139, "grad_norm": 0.9955681562423706, "learning_rate": 1.2835926449787835e-05, "loss": 0.1441, "num_input_tokens_seen": 1095616, "step": 1090 }, { "epoch": 0.5162659123055162, "grad_norm": 2.952972173690796, "learning_rate": 1.2894860914662895e-05, "loss": 0.5524, "num_input_tokens_seen": 1101632, "step": 1095 }, { "epoch": 0.5186232909005186, "grad_norm": 0.9646040797233582, "learning_rate": 1.2953795379537956e-05, "loss": 0.2054, "num_input_tokens_seen": 1106080, "step": 1100 }, { "epoch": 0.520980669495521, "grad_norm": 0.1739448457956314, "learning_rate": 1.3012729844413013e-05, "loss": 0.1222, "num_input_tokens_seen": 1111456, "step": 1105 }, { "epoch": 0.5233380480905233, "grad_norm": 0.2683197259902954, "learning_rate": 1.3071664309288072e-05, "loss": 0.1466, "num_input_tokens_seen": 1117632, "step": 1110 }, { "epoch": 0.5256954266855257, "grad_norm": 0.668407142162323, "learning_rate": 1.313059877416313e-05, "loss": 0.1511, "num_input_tokens_seen": 1122496, "step": 1115 }, { "epoch": 0.528052805280528, "grad_norm": 0.5189939737319946, "learning_rate": 1.318953323903819e-05, "loss": 0.1124, "num_input_tokens_seen": 1126176, "step": 1120 }, { "epoch": 0.5304101838755304, "grad_norm": 0.05785933509469032, "learning_rate": 1.3248467703913251e-05, "loss": 0.0938, "num_input_tokens_seen": 1133024, "step": 1125 }, { "epoch": 0.5327675624705328, "grad_norm": 0.8678096532821655, "learning_rate": 1.3307402168788307e-05, "loss": 0.0714, "num_input_tokens_seen": 1138112, "step": 1130 }, { "epoch": 0.5351249410655351, "grad_norm": 1.2218955755233765, "learning_rate": 1.3366336633663367e-05, "loss": 0.0988, "num_input_tokens_seen": 1144064, "step": 1135 }, { "epoch": 0.5374823196605375, "grad_norm": 1.0834227800369263, "learning_rate": 1.3425271098538425e-05, "loss": 0.1838, "num_input_tokens_seen": 1148992, "step": 1140 }, { "epoch": 0.5398396982555398, "grad_norm": 0.217332124710083, "learning_rate": 1.3484205563413485e-05, "loss": 0.1091, "num_input_tokens_seen": 1154592, "step": 1145 }, { "epoch": 0.5421970768505422, "grad_norm": 1.3912875652313232, "learning_rate": 1.3543140028288546e-05, "loss": 0.1669, "num_input_tokens_seen": 1159936, "step": 1150 }, { "epoch": 0.5445544554455446, "grad_norm": 0.37762194871902466, "learning_rate": 1.3602074493163602e-05, "loss": 0.2084, "num_input_tokens_seen": 1164448, "step": 1155 }, { "epoch": 0.5469118340405469, "grad_norm": 0.5035673975944519, "learning_rate": 1.3661008958038662e-05, "loss": 0.1912, "num_input_tokens_seen": 1168544, "step": 1160 }, { "epoch": 0.5492692126355493, "grad_norm": 3.70770001411438, "learning_rate": 1.371994342291372e-05, "loss": 0.1213, "num_input_tokens_seen": 1172960, "step": 1165 }, { "epoch": 0.5516265912305516, "grad_norm": 3.7580552101135254, "learning_rate": 1.377887788778878e-05, "loss": 0.1908, "num_input_tokens_seen": 1178144, "step": 1170 }, { "epoch": 0.553983969825554, "grad_norm": 4.487481594085693, "learning_rate": 1.3837812352663839e-05, "loss": 0.3895, "num_input_tokens_seen": 1184448, "step": 1175 }, { "epoch": 0.5563413484205564, "grad_norm": 1.7969030141830444, "learning_rate": 1.3896746817538897e-05, "loss": 0.3661, "num_input_tokens_seen": 1189472, "step": 1180 }, { "epoch": 0.5586987270155587, "grad_norm": 0.9583998322486877, "learning_rate": 1.3955681282413957e-05, "loss": 0.1873, "num_input_tokens_seen": 1195072, "step": 1185 }, { "epoch": 0.5610561056105611, "grad_norm": 1.132076382637024, "learning_rate": 1.4014615747289015e-05, "loss": 0.334, "num_input_tokens_seen": 1198656, "step": 1190 }, { "epoch": 0.5634134842055634, "grad_norm": 0.8229724168777466, "learning_rate": 1.4073550212164074e-05, "loss": 0.1862, "num_input_tokens_seen": 1203616, "step": 1195 }, { "epoch": 0.5657708628005658, "grad_norm": 0.657558262348175, "learning_rate": 1.4132484677039134e-05, "loss": 0.2472, "num_input_tokens_seen": 1208224, "step": 1200 }, { "epoch": 0.5681282413955682, "grad_norm": 0.4521310329437256, "learning_rate": 1.4191419141914192e-05, "loss": 0.3519, "num_input_tokens_seen": 1212608, "step": 1205 }, { "epoch": 0.5704856199905705, "grad_norm": 0.40830516815185547, "learning_rate": 1.4250353606789251e-05, "loss": 0.1084, "num_input_tokens_seen": 1218336, "step": 1210 }, { "epoch": 0.5728429985855729, "grad_norm": 0.21356698870658875, "learning_rate": 1.430928807166431e-05, "loss": 0.2362, "num_input_tokens_seen": 1223296, "step": 1215 }, { "epoch": 0.5752003771805752, "grad_norm": 1.0396902561187744, "learning_rate": 1.4368222536539369e-05, "loss": 0.2409, "num_input_tokens_seen": 1228448, "step": 1220 }, { "epoch": 0.5775577557755776, "grad_norm": 4.946902275085449, "learning_rate": 1.4427157001414429e-05, "loss": 0.5625, "num_input_tokens_seen": 1233024, "step": 1225 }, { "epoch": 0.57991513437058, "grad_norm": 4.382996559143066, "learning_rate": 1.4486091466289486e-05, "loss": 0.4444, "num_input_tokens_seen": 1238080, "step": 1230 }, { "epoch": 0.5822725129655822, "grad_norm": 0.2869453728199005, "learning_rate": 1.4545025931164546e-05, "loss": 0.1018, "num_input_tokens_seen": 1243296, "step": 1235 }, { "epoch": 0.5846298915605846, "grad_norm": 1.3394907712936401, "learning_rate": 1.4603960396039604e-05, "loss": 0.3606, "num_input_tokens_seen": 1248128, "step": 1240 }, { "epoch": 0.5869872701555869, "grad_norm": 2.1797034740448, "learning_rate": 1.4662894860914664e-05, "loss": 0.0974, "num_input_tokens_seen": 1252864, "step": 1245 }, { "epoch": 0.5893446487505893, "grad_norm": 3.4482765197753906, "learning_rate": 1.4721829325789723e-05, "loss": 0.4528, "num_input_tokens_seen": 1257856, "step": 1250 }, { "epoch": 0.5917020273455917, "grad_norm": 0.18477359414100647, "learning_rate": 1.4780763790664781e-05, "loss": 0.2641, "num_input_tokens_seen": 1262240, "step": 1255 }, { "epoch": 0.594059405940594, "grad_norm": 0.1672021895647049, "learning_rate": 1.4839698255539841e-05, "loss": 0.2806, "num_input_tokens_seen": 1267392, "step": 1260 }, { "epoch": 0.5964167845355964, "grad_norm": 2.326645851135254, "learning_rate": 1.4898632720414899e-05, "loss": 0.1672, "num_input_tokens_seen": 1272064, "step": 1265 }, { "epoch": 0.5987741631305987, "grad_norm": 1.5258152484893799, "learning_rate": 1.4957567185289958e-05, "loss": 0.1886, "num_input_tokens_seen": 1275968, "step": 1270 }, { "epoch": 0.6011315417256011, "grad_norm": 3.1828689575195312, "learning_rate": 1.5016501650165018e-05, "loss": 0.1855, "num_input_tokens_seen": 1280096, "step": 1275 }, { "epoch": 0.6034889203206035, "grad_norm": 1.1235460042953491, "learning_rate": 1.5075436115040076e-05, "loss": 0.2385, "num_input_tokens_seen": 1284704, "step": 1280 }, { "epoch": 0.6058462989156058, "grad_norm": 2.908484697341919, "learning_rate": 1.5134370579915136e-05, "loss": 0.1119, "num_input_tokens_seen": 1289312, "step": 1285 }, { "epoch": 0.6082036775106082, "grad_norm": 0.19229216873645782, "learning_rate": 1.5193305044790194e-05, "loss": 0.6526, "num_input_tokens_seen": 1293440, "step": 1290 }, { "epoch": 0.6105610561056105, "grad_norm": 3.675973653793335, "learning_rate": 1.5252239509665253e-05, "loss": 0.2308, "num_input_tokens_seen": 1297408, "step": 1295 }, { "epoch": 0.6129184347006129, "grad_norm": 2.295207977294922, "learning_rate": 1.5311173974540313e-05, "loss": 0.215, "num_input_tokens_seen": 1302304, "step": 1300 }, { "epoch": 0.6152758132956153, "grad_norm": 4.734398365020752, "learning_rate": 1.537010843941537e-05, "loss": 0.393, "num_input_tokens_seen": 1306368, "step": 1305 }, { "epoch": 0.6176331918906176, "grad_norm": 1.836348295211792, "learning_rate": 1.5429042904290432e-05, "loss": 0.2529, "num_input_tokens_seen": 1311712, "step": 1310 }, { "epoch": 0.61999057048562, "grad_norm": 0.6863689422607422, "learning_rate": 1.548797736916549e-05, "loss": 0.2564, "num_input_tokens_seen": 1320000, "step": 1315 }, { "epoch": 0.6223479490806223, "grad_norm": 2.7056217193603516, "learning_rate": 1.5546911834040548e-05, "loss": 0.3685, "num_input_tokens_seen": 1324448, "step": 1320 }, { "epoch": 0.6247053276756247, "grad_norm": 3.3472747802734375, "learning_rate": 1.5605846298915608e-05, "loss": 0.4194, "num_input_tokens_seen": 1329088, "step": 1325 }, { "epoch": 0.6270627062706271, "grad_norm": 1.9857957363128662, "learning_rate": 1.5664780763790664e-05, "loss": 0.2135, "num_input_tokens_seen": 1333920, "step": 1330 }, { "epoch": 0.6294200848656294, "grad_norm": 1.4459338188171387, "learning_rate": 1.5723715228665727e-05, "loss": 0.1912, "num_input_tokens_seen": 1338848, "step": 1335 }, { "epoch": 0.6317774634606318, "grad_norm": 3.5225374698638916, "learning_rate": 1.5782649693540783e-05, "loss": 0.3977, "num_input_tokens_seen": 1344864, "step": 1340 }, { "epoch": 0.6341348420556341, "grad_norm": 1.4261821508407593, "learning_rate": 1.5841584158415843e-05, "loss": 0.2709, "num_input_tokens_seen": 1350144, "step": 1345 }, { "epoch": 0.6364922206506365, "grad_norm": 3.335082530975342, "learning_rate": 1.5900518623290902e-05, "loss": 0.2211, "num_input_tokens_seen": 1354592, "step": 1350 }, { "epoch": 0.6388495992456389, "grad_norm": 3.8331167697906494, "learning_rate": 1.595945308816596e-05, "loss": 0.3181, "num_input_tokens_seen": 1358368, "step": 1355 }, { "epoch": 0.6412069778406412, "grad_norm": 1.1917749643325806, "learning_rate": 1.6018387553041018e-05, "loss": 0.1759, "num_input_tokens_seen": 1362848, "step": 1360 }, { "epoch": 0.6435643564356436, "grad_norm": 6.434321403503418, "learning_rate": 1.6077322017916078e-05, "loss": 0.2985, "num_input_tokens_seen": 1367456, "step": 1365 }, { "epoch": 0.6459217350306459, "grad_norm": 1.5444170236587524, "learning_rate": 1.6136256482791138e-05, "loss": 0.3404, "num_input_tokens_seen": 1373888, "step": 1370 }, { "epoch": 0.6482791136256483, "grad_norm": 0.4692562222480774, "learning_rate": 1.6195190947666197e-05, "loss": 0.3731, "num_input_tokens_seen": 1378944, "step": 1375 }, { "epoch": 0.6506364922206507, "grad_norm": 3.5406644344329834, "learning_rate": 1.6254125412541253e-05, "loss": 0.5236, "num_input_tokens_seen": 1383328, "step": 1380 }, { "epoch": 0.652993870815653, "grad_norm": 1.5345708131790161, "learning_rate": 1.6313059877416313e-05, "loss": 0.2276, "num_input_tokens_seen": 1388928, "step": 1385 }, { "epoch": 0.6553512494106554, "grad_norm": 0.6138258576393127, "learning_rate": 1.6371994342291373e-05, "loss": 0.269, "num_input_tokens_seen": 1395712, "step": 1390 }, { "epoch": 0.6577086280056577, "grad_norm": 2.2194571495056152, "learning_rate": 1.6430928807166432e-05, "loss": 0.4696, "num_input_tokens_seen": 1399936, "step": 1395 }, { "epoch": 0.6600660066006601, "grad_norm": 2.0250089168548584, "learning_rate": 1.6489863272041492e-05, "loss": 0.2222, "num_input_tokens_seen": 1404992, "step": 1400 }, { "epoch": 0.6624233851956625, "grad_norm": 3.0570125579833984, "learning_rate": 1.6548797736916548e-05, "loss": 0.4071, "num_input_tokens_seen": 1409568, "step": 1405 }, { "epoch": 0.6647807637906648, "grad_norm": 0.22041410207748413, "learning_rate": 1.6607732201791608e-05, "loss": 0.2935, "num_input_tokens_seen": 1415072, "step": 1410 }, { "epoch": 0.6671381423856672, "grad_norm": 1.3728951215744019, "learning_rate": 1.6666666666666667e-05, "loss": 0.1782, "num_input_tokens_seen": 1419552, "step": 1415 }, { "epoch": 0.6694955209806694, "grad_norm": 0.36589497327804565, "learning_rate": 1.6725601131541727e-05, "loss": 0.0904, "num_input_tokens_seen": 1424544, "step": 1420 }, { "epoch": 0.6718528995756718, "grad_norm": 0.5003448724746704, "learning_rate": 1.6784535596416787e-05, "loss": 0.4594, "num_input_tokens_seen": 1429504, "step": 1425 }, { "epoch": 0.6742102781706742, "grad_norm": 1.2839634418487549, "learning_rate": 1.6843470061291843e-05, "loss": 0.2362, "num_input_tokens_seen": 1434336, "step": 1430 }, { "epoch": 0.6765676567656765, "grad_norm": 0.057398684322834015, "learning_rate": 1.6902404526166903e-05, "loss": 0.0993, "num_input_tokens_seen": 1438496, "step": 1435 }, { "epoch": 0.6789250353606789, "grad_norm": 3.311586380004883, "learning_rate": 1.6961338991041962e-05, "loss": 0.322, "num_input_tokens_seen": 1444032, "step": 1440 }, { "epoch": 0.6812824139556812, "grad_norm": 3.0213282108306885, "learning_rate": 1.7020273455917022e-05, "loss": 0.1563, "num_input_tokens_seen": 1448864, "step": 1445 }, { "epoch": 0.6836397925506836, "grad_norm": 4.799317359924316, "learning_rate": 1.707920792079208e-05, "loss": 0.2006, "num_input_tokens_seen": 1453376, "step": 1450 }, { "epoch": 0.685997171145686, "grad_norm": 2.6102521419525146, "learning_rate": 1.7138142385667138e-05, "loss": 0.1758, "num_input_tokens_seen": 1457376, "step": 1455 }, { "epoch": 0.6883545497406883, "grad_norm": 1.8587781190872192, "learning_rate": 1.7197076850542197e-05, "loss": 0.2783, "num_input_tokens_seen": 1462848, "step": 1460 }, { "epoch": 0.6907119283356907, "grad_norm": 0.03791127726435661, "learning_rate": 1.7256011315417257e-05, "loss": 0.3199, "num_input_tokens_seen": 1467872, "step": 1465 }, { "epoch": 0.693069306930693, "grad_norm": 0.6485022902488708, "learning_rate": 1.7314945780292317e-05, "loss": 0.0979, "num_input_tokens_seen": 1472096, "step": 1470 }, { "epoch": 0.6954266855256954, "grad_norm": 1.3679031133651733, "learning_rate": 1.7373880245167376e-05, "loss": 0.281, "num_input_tokens_seen": 1477632, "step": 1475 }, { "epoch": 0.6977840641206978, "grad_norm": 1.9213603734970093, "learning_rate": 1.7432814710042433e-05, "loss": 0.2936, "num_input_tokens_seen": 1482944, "step": 1480 }, { "epoch": 0.7001414427157001, "grad_norm": 0.5510355830192566, "learning_rate": 1.7491749174917492e-05, "loss": 0.0579, "num_input_tokens_seen": 1486976, "step": 1485 }, { "epoch": 0.7024988213107025, "grad_norm": 3.3362808227539062, "learning_rate": 1.7550683639792552e-05, "loss": 0.2443, "num_input_tokens_seen": 1491168, "step": 1490 }, { "epoch": 0.7048561999057048, "grad_norm": 4.282752990722656, "learning_rate": 1.760961810466761e-05, "loss": 0.3282, "num_input_tokens_seen": 1496000, "step": 1495 }, { "epoch": 0.7072135785007072, "grad_norm": 4.001471996307373, "learning_rate": 1.766855256954267e-05, "loss": 0.1869, "num_input_tokens_seen": 1500064, "step": 1500 }, { "epoch": 0.7095709570957096, "grad_norm": 0.25225138664245605, "learning_rate": 1.7727487034417727e-05, "loss": 0.0588, "num_input_tokens_seen": 1505120, "step": 1505 }, { "epoch": 0.7119283356907119, "grad_norm": 1.1901196241378784, "learning_rate": 1.7786421499292787e-05, "loss": 0.3558, "num_input_tokens_seen": 1510240, "step": 1510 }, { "epoch": 0.7142857142857143, "grad_norm": 1.1525615453720093, "learning_rate": 1.7845355964167847e-05, "loss": 0.2256, "num_input_tokens_seen": 1515712, "step": 1515 }, { "epoch": 0.7166430928807166, "grad_norm": 0.7816784381866455, "learning_rate": 1.7904290429042906e-05, "loss": 0.0456, "num_input_tokens_seen": 1520512, "step": 1520 }, { "epoch": 0.719000471475719, "grad_norm": 3.9090487957000732, "learning_rate": 1.7963224893917966e-05, "loss": 0.2285, "num_input_tokens_seen": 1524288, "step": 1525 }, { "epoch": 0.7213578500707214, "grad_norm": 0.5825834274291992, "learning_rate": 1.8022159358793022e-05, "loss": 0.1156, "num_input_tokens_seen": 1528896, "step": 1530 }, { "epoch": 0.7237152286657237, "grad_norm": 0.922500729560852, "learning_rate": 1.8081093823668082e-05, "loss": 0.1196, "num_input_tokens_seen": 1534656, "step": 1535 }, { "epoch": 0.7260726072607261, "grad_norm": 4.962819576263428, "learning_rate": 1.814002828854314e-05, "loss": 0.3813, "num_input_tokens_seen": 1538848, "step": 1540 }, { "epoch": 0.7284299858557284, "grad_norm": 3.39953875541687, "learning_rate": 1.81989627534182e-05, "loss": 0.3749, "num_input_tokens_seen": 1543712, "step": 1545 }, { "epoch": 0.7307873644507308, "grad_norm": 4.408369541168213, "learning_rate": 1.825789721829326e-05, "loss": 0.4939, "num_input_tokens_seen": 1549152, "step": 1550 }, { "epoch": 0.7331447430457332, "grad_norm": 1.133914589881897, "learning_rate": 1.8316831683168317e-05, "loss": 0.0758, "num_input_tokens_seen": 1554080, "step": 1555 }, { "epoch": 0.7355021216407355, "grad_norm": 0.08585638552904129, "learning_rate": 1.8375766148043376e-05, "loss": 0.0781, "num_input_tokens_seen": 1559104, "step": 1560 }, { "epoch": 0.7378595002357379, "grad_norm": 0.7613687515258789, "learning_rate": 1.8434700612918436e-05, "loss": 0.1049, "num_input_tokens_seen": 1565152, "step": 1565 }, { "epoch": 0.7402168788307402, "grad_norm": 0.2143021821975708, "learning_rate": 1.8493635077793496e-05, "loss": 0.3828, "num_input_tokens_seen": 1570688, "step": 1570 }, { "epoch": 0.7425742574257426, "grad_norm": 3.294071912765503, "learning_rate": 1.8552569542668555e-05, "loss": 0.3156, "num_input_tokens_seen": 1576256, "step": 1575 }, { "epoch": 0.744931636020745, "grad_norm": 1.132987380027771, "learning_rate": 1.861150400754361e-05, "loss": 0.4078, "num_input_tokens_seen": 1581056, "step": 1580 }, { "epoch": 0.7472890146157473, "grad_norm": 2.625394582748413, "learning_rate": 1.867043847241867e-05, "loss": 0.165, "num_input_tokens_seen": 1586464, "step": 1585 }, { "epoch": 0.7496463932107497, "grad_norm": 0.7499021291732788, "learning_rate": 1.872937293729373e-05, "loss": 0.3333, "num_input_tokens_seen": 1591296, "step": 1590 }, { "epoch": 0.752003771805752, "grad_norm": 2.139491081237793, "learning_rate": 1.878830740216879e-05, "loss": 0.2179, "num_input_tokens_seen": 1595488, "step": 1595 }, { "epoch": 0.7543611504007544, "grad_norm": 0.3875408172607422, "learning_rate": 1.884724186704385e-05, "loss": 0.1738, "num_input_tokens_seen": 1601312, "step": 1600 }, { "epoch": 0.7567185289957568, "grad_norm": 2.3065764904022217, "learning_rate": 1.8906176331918906e-05, "loss": 0.3845, "num_input_tokens_seen": 1606336, "step": 1605 }, { "epoch": 0.759075907590759, "grad_norm": 0.19686675071716309, "learning_rate": 1.8965110796793966e-05, "loss": 0.0949, "num_input_tokens_seen": 1611008, "step": 1610 }, { "epoch": 0.7614332861857614, "grad_norm": 0.1573130041360855, "learning_rate": 1.9024045261669026e-05, "loss": 0.1648, "num_input_tokens_seen": 1615328, "step": 1615 }, { "epoch": 0.7637906647807637, "grad_norm": 3.3103699684143066, "learning_rate": 1.9082979726544085e-05, "loss": 0.3604, "num_input_tokens_seen": 1620320, "step": 1620 }, { "epoch": 0.7661480433757661, "grad_norm": 2.5614073276519775, "learning_rate": 1.9141914191419145e-05, "loss": 0.263, "num_input_tokens_seen": 1625344, "step": 1625 }, { "epoch": 0.7685054219707685, "grad_norm": 1.8382521867752075, "learning_rate": 1.92008486562942e-05, "loss": 0.1152, "num_input_tokens_seen": 1630624, "step": 1630 }, { "epoch": 0.7708628005657708, "grad_norm": 2.2548599243164062, "learning_rate": 1.925978312116926e-05, "loss": 0.3091, "num_input_tokens_seen": 1635136, "step": 1635 }, { "epoch": 0.7732201791607732, "grad_norm": 0.9114040732383728, "learning_rate": 1.9318717586044317e-05, "loss": 0.3795, "num_input_tokens_seen": 1640640, "step": 1640 }, { "epoch": 0.7755775577557755, "grad_norm": 1.615908145904541, "learning_rate": 1.937765205091938e-05, "loss": 0.1843, "num_input_tokens_seen": 1647040, "step": 1645 }, { "epoch": 0.7779349363507779, "grad_norm": 1.2994569540023804, "learning_rate": 1.943658651579444e-05, "loss": 0.4343, "num_input_tokens_seen": 1651712, "step": 1650 }, { "epoch": 0.7802923149457803, "grad_norm": 1.3969974517822266, "learning_rate": 1.9495520980669496e-05, "loss": 0.0954, "num_input_tokens_seen": 1657536, "step": 1655 }, { "epoch": 0.7826496935407826, "grad_norm": 0.42156437039375305, "learning_rate": 1.9554455445544556e-05, "loss": 0.3075, "num_input_tokens_seen": 1663616, "step": 1660 }, { "epoch": 0.785007072135785, "grad_norm": 5.52536153793335, "learning_rate": 1.9613389910419612e-05, "loss": 0.5526, "num_input_tokens_seen": 1669600, "step": 1665 }, { "epoch": 0.7873644507307873, "grad_norm": 2.023230791091919, "learning_rate": 1.9672324375294675e-05, "loss": 0.2544, "num_input_tokens_seen": 1674560, "step": 1670 }, { "epoch": 0.7897218293257897, "grad_norm": 1.2472273111343384, "learning_rate": 1.9731258840169734e-05, "loss": 0.2861, "num_input_tokens_seen": 1679616, "step": 1675 }, { "epoch": 0.7920792079207921, "grad_norm": 0.4286731481552124, "learning_rate": 1.979019330504479e-05, "loss": 0.0816, "num_input_tokens_seen": 1684160, "step": 1680 }, { "epoch": 0.7944365865157944, "grad_norm": 4.015572547912598, "learning_rate": 1.984912776991985e-05, "loss": 0.4813, "num_input_tokens_seen": 1689280, "step": 1685 }, { "epoch": 0.7967939651107968, "grad_norm": 0.4605790972709656, "learning_rate": 1.9908062234794907e-05, "loss": 0.1395, "num_input_tokens_seen": 1694368, "step": 1690 }, { "epoch": 0.7991513437057991, "grad_norm": 1.5609149932861328, "learning_rate": 1.996699669966997e-05, "loss": 0.2475, "num_input_tokens_seen": 1699072, "step": 1695 }, { "epoch": 0.8015087223008015, "grad_norm": 1.3907461166381836, "learning_rate": 2.002593116454503e-05, "loss": 0.1527, "num_input_tokens_seen": 1704352, "step": 1700 }, { "epoch": 0.8038661008958039, "grad_norm": 3.0022904872894287, "learning_rate": 2.0084865629420085e-05, "loss": 0.4713, "num_input_tokens_seen": 1709088, "step": 1705 }, { "epoch": 0.8062234794908062, "grad_norm": 1.161099910736084, "learning_rate": 2.0143800094295145e-05, "loss": 0.1139, "num_input_tokens_seen": 1713600, "step": 1710 }, { "epoch": 0.8085808580858086, "grad_norm": 0.6843261122703552, "learning_rate": 2.02027345591702e-05, "loss": 0.2536, "num_input_tokens_seen": 1720224, "step": 1715 }, { "epoch": 0.8109382366808109, "grad_norm": 1.7049630880355835, "learning_rate": 2.0261669024045264e-05, "loss": 0.3335, "num_input_tokens_seen": 1725280, "step": 1720 }, { "epoch": 0.8132956152758133, "grad_norm": 1.3455904722213745, "learning_rate": 2.0320603488920324e-05, "loss": 0.2141, "num_input_tokens_seen": 1731904, "step": 1725 }, { "epoch": 0.8156529938708157, "grad_norm": 1.344369888305664, "learning_rate": 2.037953795379538e-05, "loss": 0.1601, "num_input_tokens_seen": 1737056, "step": 1730 }, { "epoch": 0.818010372465818, "grad_norm": 0.6241254210472107, "learning_rate": 2.043847241867044e-05, "loss": 0.1023, "num_input_tokens_seen": 1741568, "step": 1735 }, { "epoch": 0.8203677510608204, "grad_norm": 2.0057213306427, "learning_rate": 2.0497406883545496e-05, "loss": 0.3312, "num_input_tokens_seen": 1745984, "step": 1740 }, { "epoch": 0.8227251296558227, "grad_norm": 1.9785776138305664, "learning_rate": 2.055634134842056e-05, "loss": 0.3677, "num_input_tokens_seen": 1750688, "step": 1745 }, { "epoch": 0.8250825082508251, "grad_norm": 1.036481261253357, "learning_rate": 2.061527581329562e-05, "loss": 0.3251, "num_input_tokens_seen": 1755648, "step": 1750 }, { "epoch": 0.8274398868458275, "grad_norm": 1.5836551189422607, "learning_rate": 2.0674210278170675e-05, "loss": 0.499, "num_input_tokens_seen": 1761408, "step": 1755 }, { "epoch": 0.8297972654408298, "grad_norm": 0.09046540409326553, "learning_rate": 2.0733144743045735e-05, "loss": 0.2298, "num_input_tokens_seen": 1765984, "step": 1760 }, { "epoch": 0.8321546440358322, "grad_norm": 0.34454333782196045, "learning_rate": 2.079207920792079e-05, "loss": 0.1537, "num_input_tokens_seen": 1771040, "step": 1765 }, { "epoch": 0.8345120226308345, "grad_norm": 1.0819438695907593, "learning_rate": 2.085101367279585e-05, "loss": 0.2474, "num_input_tokens_seen": 1776288, "step": 1770 }, { "epoch": 0.8368694012258369, "grad_norm": 3.927859306335449, "learning_rate": 2.0909948137670914e-05, "loss": 0.2176, "num_input_tokens_seen": 1781632, "step": 1775 }, { "epoch": 0.8392267798208393, "grad_norm": 1.384766697883606, "learning_rate": 2.096888260254597e-05, "loss": 0.1939, "num_input_tokens_seen": 1786048, "step": 1780 }, { "epoch": 0.8415841584158416, "grad_norm": 0.652260959148407, "learning_rate": 2.102781706742103e-05, "loss": 0.2347, "num_input_tokens_seen": 1790848, "step": 1785 }, { "epoch": 0.843941537010844, "grad_norm": 1.1316109895706177, "learning_rate": 2.1086751532296086e-05, "loss": 0.0876, "num_input_tokens_seen": 1795680, "step": 1790 }, { "epoch": 0.8462989156058462, "grad_norm": 3.772827386856079, "learning_rate": 2.1145685997171145e-05, "loss": 0.2951, "num_input_tokens_seen": 1800448, "step": 1795 }, { "epoch": 0.8486562942008486, "grad_norm": 1.026903748512268, "learning_rate": 2.120462046204621e-05, "loss": 0.2409, "num_input_tokens_seen": 1805568, "step": 1800 }, { "epoch": 0.851013672795851, "grad_norm": 2.166411876678467, "learning_rate": 2.1263554926921265e-05, "loss": 0.264, "num_input_tokens_seen": 1810080, "step": 1805 }, { "epoch": 0.8533710513908533, "grad_norm": 1.0781108140945435, "learning_rate": 2.1322489391796324e-05, "loss": 0.2488, "num_input_tokens_seen": 1814400, "step": 1810 }, { "epoch": 0.8557284299858557, "grad_norm": 0.6876959204673767, "learning_rate": 2.138142385667138e-05, "loss": 0.6154, "num_input_tokens_seen": 1819296, "step": 1815 }, { "epoch": 0.858085808580858, "grad_norm": 1.065621256828308, "learning_rate": 2.144035832154644e-05, "loss": 0.1989, "num_input_tokens_seen": 1824160, "step": 1820 }, { "epoch": 0.8604431871758604, "grad_norm": 2.272641181945801, "learning_rate": 2.1499292786421503e-05, "loss": 0.5156, "num_input_tokens_seen": 1830240, "step": 1825 }, { "epoch": 0.8628005657708628, "grad_norm": 0.044948965311050415, "learning_rate": 2.155822725129656e-05, "loss": 0.1872, "num_input_tokens_seen": 1834752, "step": 1830 }, { "epoch": 0.8651579443658651, "grad_norm": 0.9231863021850586, "learning_rate": 2.161716171617162e-05, "loss": 0.4061, "num_input_tokens_seen": 1839936, "step": 1835 }, { "epoch": 0.8675153229608675, "grad_norm": 0.2501983046531677, "learning_rate": 2.1676096181046675e-05, "loss": 0.0982, "num_input_tokens_seen": 1844448, "step": 1840 }, { "epoch": 0.8698727015558698, "grad_norm": 0.5401450395584106, "learning_rate": 2.1735030645921735e-05, "loss": 0.1379, "num_input_tokens_seen": 1848704, "step": 1845 }, { "epoch": 0.8722300801508722, "grad_norm": 0.06654199957847595, "learning_rate": 2.1793965110796798e-05, "loss": 0.0753, "num_input_tokens_seen": 1854144, "step": 1850 }, { "epoch": 0.8745874587458746, "grad_norm": 2.4736573696136475, "learning_rate": 2.1852899575671854e-05, "loss": 0.1825, "num_input_tokens_seen": 1859232, "step": 1855 }, { "epoch": 0.8769448373408769, "grad_norm": 0.8052133321762085, "learning_rate": 2.1911834040546914e-05, "loss": 0.0802, "num_input_tokens_seen": 1864736, "step": 1860 }, { "epoch": 0.8793022159358793, "grad_norm": 0.9270957708358765, "learning_rate": 2.197076850542197e-05, "loss": 0.1101, "num_input_tokens_seen": 1870336, "step": 1865 }, { "epoch": 0.8816595945308816, "grad_norm": 0.6477622389793396, "learning_rate": 2.202970297029703e-05, "loss": 0.0761, "num_input_tokens_seen": 1875872, "step": 1870 }, { "epoch": 0.884016973125884, "grad_norm": 1.2417981624603271, "learning_rate": 2.208863743517209e-05, "loss": 0.1346, "num_input_tokens_seen": 1880288, "step": 1875 }, { "epoch": 0.8863743517208864, "grad_norm": 5.403919696807861, "learning_rate": 2.214757190004715e-05, "loss": 0.4116, "num_input_tokens_seen": 1885376, "step": 1880 }, { "epoch": 0.8887317303158887, "grad_norm": 1.2358206510543823, "learning_rate": 2.220650636492221e-05, "loss": 0.1925, "num_input_tokens_seen": 1890272, "step": 1885 }, { "epoch": 0.8910891089108911, "grad_norm": 1.2298117876052856, "learning_rate": 2.2265440829797265e-05, "loss": 0.0604, "num_input_tokens_seen": 1894976, "step": 1890 }, { "epoch": 0.8934464875058934, "grad_norm": 1.5526108741760254, "learning_rate": 2.2324375294672324e-05, "loss": 0.1662, "num_input_tokens_seen": 1900992, "step": 1895 }, { "epoch": 0.8958038661008958, "grad_norm": 1.05245041847229, "learning_rate": 2.2383309759547384e-05, "loss": 0.3383, "num_input_tokens_seen": 1905984, "step": 1900 }, { "epoch": 0.8981612446958982, "grad_norm": 14.301915168762207, "learning_rate": 2.2442244224422444e-05, "loss": 0.1294, "num_input_tokens_seen": 1910848, "step": 1905 }, { "epoch": 0.9005186232909005, "grad_norm": 0.2402954250574112, "learning_rate": 2.2501178689297503e-05, "loss": 0.1594, "num_input_tokens_seen": 1916448, "step": 1910 }, { "epoch": 0.9028760018859029, "grad_norm": 1.3843051195144653, "learning_rate": 2.256011315417256e-05, "loss": 0.1016, "num_input_tokens_seen": 1921440, "step": 1915 }, { "epoch": 0.9052333804809052, "grad_norm": 0.1875217705965042, "learning_rate": 2.261904761904762e-05, "loss": 0.0926, "num_input_tokens_seen": 1925984, "step": 1920 }, { "epoch": 0.9075907590759076, "grad_norm": 1.8122098445892334, "learning_rate": 2.267798208392268e-05, "loss": 0.3353, "num_input_tokens_seen": 1932096, "step": 1925 }, { "epoch": 0.90994813767091, "grad_norm": 3.306304693222046, "learning_rate": 2.273691654879774e-05, "loss": 0.3191, "num_input_tokens_seen": 1936640, "step": 1930 }, { "epoch": 0.9123055162659123, "grad_norm": 3.1293704509735107, "learning_rate": 2.2795851013672798e-05, "loss": 0.3887, "num_input_tokens_seen": 1941472, "step": 1935 }, { "epoch": 0.9146628948609147, "grad_norm": 1.2246793508529663, "learning_rate": 2.2854785478547854e-05, "loss": 0.5009, "num_input_tokens_seen": 1946528, "step": 1940 }, { "epoch": 0.917020273455917, "grad_norm": 0.09886981546878815, "learning_rate": 2.2913719943422914e-05, "loss": 0.1029, "num_input_tokens_seen": 1952352, "step": 1945 }, { "epoch": 0.9193776520509194, "grad_norm": 1.4581795930862427, "learning_rate": 2.2972654408297974e-05, "loss": 0.5685, "num_input_tokens_seen": 1957344, "step": 1950 }, { "epoch": 0.9217350306459218, "grad_norm": 0.12830416858196259, "learning_rate": 2.3031588873173033e-05, "loss": 0.1706, "num_input_tokens_seen": 1961824, "step": 1955 }, { "epoch": 0.9240924092409241, "grad_norm": 0.4006863236427307, "learning_rate": 2.3090523338048093e-05, "loss": 0.2204, "num_input_tokens_seen": 1966368, "step": 1960 }, { "epoch": 0.9264497878359265, "grad_norm": 1.078019142150879, "learning_rate": 2.314945780292315e-05, "loss": 0.1322, "num_input_tokens_seen": 1973600, "step": 1965 }, { "epoch": 0.9288071664309288, "grad_norm": 1.1136996746063232, "learning_rate": 2.320839226779821e-05, "loss": 0.1577, "num_input_tokens_seen": 1978880, "step": 1970 }, { "epoch": 0.9311645450259312, "grad_norm": 0.40913453698158264, "learning_rate": 2.326732673267327e-05, "loss": 0.2507, "num_input_tokens_seen": 1984864, "step": 1975 }, { "epoch": 0.9335219236209336, "grad_norm": 0.7071713209152222, "learning_rate": 2.3326261197548328e-05, "loss": 0.2153, "num_input_tokens_seen": 1990592, "step": 1980 }, { "epoch": 0.9358793022159358, "grad_norm": 2.7038421630859375, "learning_rate": 2.3385195662423388e-05, "loss": 0.1532, "num_input_tokens_seen": 1996512, "step": 1985 }, { "epoch": 0.9382366808109383, "grad_norm": 0.44943171739578247, "learning_rate": 2.3444130127298444e-05, "loss": 0.0845, "num_input_tokens_seen": 2000960, "step": 1990 }, { "epoch": 0.9405940594059405, "grad_norm": 0.8735500574111938, "learning_rate": 2.3503064592173503e-05, "loss": 0.0906, "num_input_tokens_seen": 2005056, "step": 1995 }, { "epoch": 0.9429514380009429, "grad_norm": 0.4211135506629944, "learning_rate": 2.3561999057048563e-05, "loss": 0.1802, "num_input_tokens_seen": 2009472, "step": 2000 }, { "epoch": 0.9453088165959453, "grad_norm": 3.7732372283935547, "learning_rate": 2.3620933521923623e-05, "loss": 0.615, "num_input_tokens_seen": 2013888, "step": 2005 }, { "epoch": 0.9476661951909476, "grad_norm": 0.8833938837051392, "learning_rate": 2.3679867986798682e-05, "loss": 0.2034, "num_input_tokens_seen": 2019808, "step": 2010 }, { "epoch": 0.95002357378595, "grad_norm": 0.20489932596683502, "learning_rate": 2.373880245167374e-05, "loss": 0.2061, "num_input_tokens_seen": 2025440, "step": 2015 }, { "epoch": 0.9523809523809523, "grad_norm": 1.6803643703460693, "learning_rate": 2.3797736916548798e-05, "loss": 0.2288, "num_input_tokens_seen": 2030464, "step": 2020 }, { "epoch": 0.9547383309759547, "grad_norm": 0.6970669627189636, "learning_rate": 2.3856671381423858e-05, "loss": 0.2549, "num_input_tokens_seen": 2035328, "step": 2025 }, { "epoch": 0.9570957095709571, "grad_norm": 1.3964884281158447, "learning_rate": 2.3915605846298918e-05, "loss": 0.1494, "num_input_tokens_seen": 2039616, "step": 2030 }, { "epoch": 0.9594530881659594, "grad_norm": 0.260877400636673, "learning_rate": 2.3974540311173977e-05, "loss": 0.2502, "num_input_tokens_seen": 2045408, "step": 2035 }, { "epoch": 0.9618104667609618, "grad_norm": 0.6182429790496826, "learning_rate": 2.4033474776049033e-05, "loss": 0.1787, "num_input_tokens_seen": 2050816, "step": 2040 }, { "epoch": 0.9641678453559641, "grad_norm": 0.9584265351295471, "learning_rate": 2.4092409240924093e-05, "loss": 0.1263, "num_input_tokens_seen": 2056128, "step": 2045 }, { "epoch": 0.9665252239509665, "grad_norm": 1.3301329612731934, "learning_rate": 2.4151343705799153e-05, "loss": 0.2049, "num_input_tokens_seen": 2061120, "step": 2050 }, { "epoch": 0.9688826025459689, "grad_norm": 0.5444257259368896, "learning_rate": 2.4210278170674212e-05, "loss": 0.3396, "num_input_tokens_seen": 2066400, "step": 2055 }, { "epoch": 0.9712399811409712, "grad_norm": 1.0941691398620605, "learning_rate": 2.4269212635549272e-05, "loss": 0.2418, "num_input_tokens_seen": 2071200, "step": 2060 }, { "epoch": 0.9735973597359736, "grad_norm": 1.3299826383590698, "learning_rate": 2.4328147100424328e-05, "loss": 0.2156, "num_input_tokens_seen": 2076320, "step": 2065 }, { "epoch": 0.9759547383309759, "grad_norm": 2.964113235473633, "learning_rate": 2.4387081565299388e-05, "loss": 0.8193, "num_input_tokens_seen": 2082016, "step": 2070 }, { "epoch": 0.9783121169259783, "grad_norm": 0.9140360355377197, "learning_rate": 2.4446016030174447e-05, "loss": 0.1686, "num_input_tokens_seen": 2087136, "step": 2075 }, { "epoch": 0.9806694955209807, "grad_norm": 1.3949781656265259, "learning_rate": 2.4504950495049507e-05, "loss": 0.5385, "num_input_tokens_seen": 2092352, "step": 2080 }, { "epoch": 0.983026874115983, "grad_norm": 1.2460376024246216, "learning_rate": 2.4563884959924567e-05, "loss": 0.0883, "num_input_tokens_seen": 2096576, "step": 2085 }, { "epoch": 0.9853842527109854, "grad_norm": 1.3474409580230713, "learning_rate": 2.4622819424799623e-05, "loss": 0.1547, "num_input_tokens_seen": 2101472, "step": 2090 }, { "epoch": 0.9877416313059877, "grad_norm": 2.1528844833374023, "learning_rate": 2.4681753889674683e-05, "loss": 0.5854, "num_input_tokens_seen": 2106144, "step": 2095 }, { "epoch": 0.9900990099009901, "grad_norm": 4.016702175140381, "learning_rate": 2.4740688354549742e-05, "loss": 0.4764, "num_input_tokens_seen": 2110880, "step": 2100 }, { "epoch": 0.9924563884959925, "grad_norm": 2.8344123363494873, "learning_rate": 2.4799622819424802e-05, "loss": 0.1748, "num_input_tokens_seen": 2115712, "step": 2105 }, { "epoch": 0.9948137670909948, "grad_norm": 0.06676863133907318, "learning_rate": 2.485855728429986e-05, "loss": 0.1007, "num_input_tokens_seen": 2120192, "step": 2110 }, { "epoch": 0.9971711456859972, "grad_norm": 2.010216474533081, "learning_rate": 2.4917491749174918e-05, "loss": 0.1933, "num_input_tokens_seen": 2125728, "step": 2115 }, { "epoch": 0.9995285242809995, "grad_norm": 0.573483407497406, "learning_rate": 2.4976426214049977e-05, "loss": 0.2441, "num_input_tokens_seen": 2130624, "step": 2120 }, { "epoch": 1.0, "eval_loss": 0.21642036736011505, "eval_runtime": 15.1415, "eval_samples_per_second": 62.279, "eval_steps_per_second": 15.586, "num_input_tokens_seen": 2131904, "step": 2121 }, { "epoch": 1.0018859028760019, "grad_norm": 0.014563685283064842, "learning_rate": 2.5035360678925034e-05, "loss": 0.1318, "num_input_tokens_seen": 2136160, "step": 2125 }, { "epoch": 1.0042432814710043, "grad_norm": 0.41324383020401, "learning_rate": 2.5094295143800097e-05, "loss": 0.1356, "num_input_tokens_seen": 2140320, "step": 2130 }, { "epoch": 1.0066006600660067, "grad_norm": 2.7130541801452637, "learning_rate": 2.5153229608675156e-05, "loss": 0.3517, "num_input_tokens_seen": 2145952, "step": 2135 }, { "epoch": 1.0089580386610089, "grad_norm": 0.34411728382110596, "learning_rate": 2.5212164073550216e-05, "loss": 0.2544, "num_input_tokens_seen": 2151936, "step": 2140 }, { "epoch": 1.0113154172560113, "grad_norm": 0.31951960921287537, "learning_rate": 2.527109853842527e-05, "loss": 0.0446, "num_input_tokens_seen": 2157824, "step": 2145 }, { "epoch": 1.0136727958510137, "grad_norm": 0.8022886514663696, "learning_rate": 2.533003300330033e-05, "loss": 0.131, "num_input_tokens_seen": 2163744, "step": 2150 }, { "epoch": 1.016030174446016, "grad_norm": 0.377561092376709, "learning_rate": 2.538896746817539e-05, "loss": 0.1303, "num_input_tokens_seen": 2168608, "step": 2155 }, { "epoch": 1.0183875530410185, "grad_norm": 1.708582878112793, "learning_rate": 2.544790193305045e-05, "loss": 0.1154, "num_input_tokens_seen": 2173312, "step": 2160 }, { "epoch": 1.0207449316360206, "grad_norm": 1.2108759880065918, "learning_rate": 2.550683639792551e-05, "loss": 0.3404, "num_input_tokens_seen": 2179008, "step": 2165 }, { "epoch": 1.023102310231023, "grad_norm": 1.5471992492675781, "learning_rate": 2.5565770862800564e-05, "loss": 0.3197, "num_input_tokens_seen": 2184320, "step": 2170 }, { "epoch": 1.0254596888260255, "grad_norm": 0.38487452268600464, "learning_rate": 2.5624705327675623e-05, "loss": 0.3234, "num_input_tokens_seen": 2188928, "step": 2175 }, { "epoch": 1.0278170674210279, "grad_norm": 1.6308656930923462, "learning_rate": 2.5683639792550683e-05, "loss": 0.117, "num_input_tokens_seen": 2194016, "step": 2180 }, { "epoch": 1.0301744460160303, "grad_norm": 1.2465039491653442, "learning_rate": 2.5742574257425746e-05, "loss": 0.3228, "num_input_tokens_seen": 2197632, "step": 2185 }, { "epoch": 1.0325318246110324, "grad_norm": 1.0985430479049683, "learning_rate": 2.5801508722300805e-05, "loss": 0.1585, "num_input_tokens_seen": 2203328, "step": 2190 }, { "epoch": 1.0348892032060348, "grad_norm": 0.9291914701461792, "learning_rate": 2.5860443187175858e-05, "loss": 0.1932, "num_input_tokens_seen": 2207424, "step": 2195 }, { "epoch": 1.0372465818010372, "grad_norm": 1.6135057210922241, "learning_rate": 2.5919377652050918e-05, "loss": 0.2348, "num_input_tokens_seen": 2211616, "step": 2200 }, { "epoch": 1.0396039603960396, "grad_norm": 1.8505334854125977, "learning_rate": 2.5978312116925978e-05, "loss": 0.2061, "num_input_tokens_seen": 2215776, "step": 2205 }, { "epoch": 1.041961338991042, "grad_norm": 0.5142789483070374, "learning_rate": 2.603724658180104e-05, "loss": 0.3564, "num_input_tokens_seen": 2220576, "step": 2210 }, { "epoch": 1.0443187175860442, "grad_norm": 0.38521337509155273, "learning_rate": 2.60961810466761e-05, "loss": 0.444, "num_input_tokens_seen": 2225024, "step": 2215 }, { "epoch": 1.0466760961810466, "grad_norm": 0.188017800450325, "learning_rate": 2.6155115511551153e-05, "loss": 0.228, "num_input_tokens_seen": 2229888, "step": 2220 }, { "epoch": 1.049033474776049, "grad_norm": 1.3210804462432861, "learning_rate": 2.6214049976426213e-05, "loss": 0.3389, "num_input_tokens_seen": 2234624, "step": 2225 }, { "epoch": 1.0513908533710514, "grad_norm": 0.21132072806358337, "learning_rate": 2.6272984441301272e-05, "loss": 0.0992, "num_input_tokens_seen": 2238656, "step": 2230 }, { "epoch": 1.0537482319660538, "grad_norm": 0.8836784958839417, "learning_rate": 2.6331918906176335e-05, "loss": 0.0696, "num_input_tokens_seen": 2243552, "step": 2235 }, { "epoch": 1.056105610561056, "grad_norm": 4.442370414733887, "learning_rate": 2.6390853371051395e-05, "loss": 0.1282, "num_input_tokens_seen": 2249056, "step": 2240 }, { "epoch": 1.0584629891560584, "grad_norm": 1.3421839475631714, "learning_rate": 2.6449787835926448e-05, "loss": 0.3155, "num_input_tokens_seen": 2254080, "step": 2245 }, { "epoch": 1.0608203677510608, "grad_norm": 0.4467129707336426, "learning_rate": 2.6508722300801507e-05, "loss": 0.1008, "num_input_tokens_seen": 2259232, "step": 2250 }, { "epoch": 1.0631777463460632, "grad_norm": 4.004823207855225, "learning_rate": 2.6567656765676567e-05, "loss": 0.1357, "num_input_tokens_seen": 2263680, "step": 2255 }, { "epoch": 1.0655351249410656, "grad_norm": 0.9712507128715515, "learning_rate": 2.662659123055163e-05, "loss": 0.2266, "num_input_tokens_seen": 2267904, "step": 2260 }, { "epoch": 1.0678925035360678, "grad_norm": 0.22948333621025085, "learning_rate": 2.668552569542669e-05, "loss": 0.1956, "num_input_tokens_seen": 2272992, "step": 2265 }, { "epoch": 1.0702498821310702, "grad_norm": 0.5122042894363403, "learning_rate": 2.6744460160301743e-05, "loss": 0.3177, "num_input_tokens_seen": 2276864, "step": 2270 }, { "epoch": 1.0726072607260726, "grad_norm": 0.2782951295375824, "learning_rate": 2.6803394625176802e-05, "loss": 0.3144, "num_input_tokens_seen": 2282688, "step": 2275 }, { "epoch": 1.074964639321075, "grad_norm": 1.4053559303283691, "learning_rate": 2.6862329090051862e-05, "loss": 0.4353, "num_input_tokens_seen": 2286720, "step": 2280 }, { "epoch": 1.0773220179160774, "grad_norm": 0.46263977885246277, "learning_rate": 2.6921263554926925e-05, "loss": 0.1366, "num_input_tokens_seen": 2291488, "step": 2285 }, { "epoch": 1.0796793965110796, "grad_norm": 3.9761526584625244, "learning_rate": 2.6980198019801985e-05, "loss": 0.2038, "num_input_tokens_seen": 2296736, "step": 2290 }, { "epoch": 1.082036775106082, "grad_norm": 3.0469539165496826, "learning_rate": 2.7039132484677037e-05, "loss": 0.3165, "num_input_tokens_seen": 2301184, "step": 2295 }, { "epoch": 1.0843941537010844, "grad_norm": 2.093052625656128, "learning_rate": 2.7098066949552097e-05, "loss": 0.2067, "num_input_tokens_seen": 2305536, "step": 2300 }, { "epoch": 1.0867515322960868, "grad_norm": 0.20103374123573303, "learning_rate": 2.7157001414427157e-05, "loss": 0.1162, "num_input_tokens_seen": 2311584, "step": 2305 }, { "epoch": 1.0891089108910892, "grad_norm": 1.1227384805679321, "learning_rate": 2.7215935879302216e-05, "loss": 0.1575, "num_input_tokens_seen": 2316192, "step": 2310 }, { "epoch": 1.0914662894860914, "grad_norm": 1.4277511835098267, "learning_rate": 2.727487034417728e-05, "loss": 0.2278, "num_input_tokens_seen": 2320256, "step": 2315 }, { "epoch": 1.0938236680810938, "grad_norm": 0.24578335881233215, "learning_rate": 2.7333804809052332e-05, "loss": 0.2527, "num_input_tokens_seen": 2324384, "step": 2320 }, { "epoch": 1.0961810466760962, "grad_norm": 2.213207483291626, "learning_rate": 2.7392739273927392e-05, "loss": 0.1136, "num_input_tokens_seen": 2328960, "step": 2325 }, { "epoch": 1.0985384252710986, "grad_norm": 2.090031385421753, "learning_rate": 2.745167373880245e-05, "loss": 0.2339, "num_input_tokens_seen": 2334368, "step": 2330 }, { "epoch": 1.100895803866101, "grad_norm": 1.1255806684494019, "learning_rate": 2.751060820367751e-05, "loss": 0.1549, "num_input_tokens_seen": 2339008, "step": 2335 }, { "epoch": 1.1032531824611032, "grad_norm": 2.1871211528778076, "learning_rate": 2.7569542668552574e-05, "loss": 0.2015, "num_input_tokens_seen": 2343392, "step": 2340 }, { "epoch": 1.1056105610561056, "grad_norm": 2.4945051670074463, "learning_rate": 2.7628477133427627e-05, "loss": 0.2196, "num_input_tokens_seen": 2349184, "step": 2345 }, { "epoch": 1.107967939651108, "grad_norm": 0.10182258486747742, "learning_rate": 2.7687411598302687e-05, "loss": 0.181, "num_input_tokens_seen": 2353024, "step": 2350 }, { "epoch": 1.1103253182461104, "grad_norm": 0.7863506078720093, "learning_rate": 2.7746346063177746e-05, "loss": 0.3456, "num_input_tokens_seen": 2357856, "step": 2355 }, { "epoch": 1.1126826968411128, "grad_norm": 2.122943639755249, "learning_rate": 2.7805280528052806e-05, "loss": 0.2579, "num_input_tokens_seen": 2366080, "step": 2360 }, { "epoch": 1.115040075436115, "grad_norm": 2.596285581588745, "learning_rate": 2.786421499292787e-05, "loss": 0.5999, "num_input_tokens_seen": 2371424, "step": 2365 }, { "epoch": 1.1173974540311173, "grad_norm": 0.7093688249588013, "learning_rate": 2.792314945780292e-05, "loss": 0.1537, "num_input_tokens_seen": 2378112, "step": 2370 }, { "epoch": 1.1197548326261197, "grad_norm": 0.3301173150539398, "learning_rate": 2.798208392267798e-05, "loss": 0.1186, "num_input_tokens_seen": 2383040, "step": 2375 }, { "epoch": 1.1221122112211221, "grad_norm": 2.1163272857666016, "learning_rate": 2.804101838755304e-05, "loss": 0.1502, "num_input_tokens_seen": 2387360, "step": 2380 }, { "epoch": 1.1244695898161245, "grad_norm": 2.4239253997802734, "learning_rate": 2.80999528524281e-05, "loss": 0.2728, "num_input_tokens_seen": 2392576, "step": 2385 }, { "epoch": 1.1268269684111267, "grad_norm": 0.11068591475486755, "learning_rate": 2.8158887317303164e-05, "loss": 0.1072, "num_input_tokens_seen": 2397760, "step": 2390 }, { "epoch": 1.1291843470061291, "grad_norm": 1.1558191776275635, "learning_rate": 2.8217821782178216e-05, "loss": 0.1299, "num_input_tokens_seen": 2401792, "step": 2395 }, { "epoch": 1.1315417256011315, "grad_norm": 2.150131940841675, "learning_rate": 2.8276756247053276e-05, "loss": 0.2337, "num_input_tokens_seen": 2407680, "step": 2400 }, { "epoch": 1.133899104196134, "grad_norm": 0.284222275018692, "learning_rate": 2.8335690711928336e-05, "loss": 0.1614, "num_input_tokens_seen": 2412512, "step": 2405 }, { "epoch": 1.1362564827911363, "grad_norm": 0.6931996941566467, "learning_rate": 2.8394625176803395e-05, "loss": 0.2188, "num_input_tokens_seen": 2417696, "step": 2410 }, { "epoch": 1.1386138613861387, "grad_norm": 3.4733896255493164, "learning_rate": 2.8453559641678455e-05, "loss": 0.2638, "num_input_tokens_seen": 2422304, "step": 2415 }, { "epoch": 1.140971239981141, "grad_norm": 0.7826411128044128, "learning_rate": 2.851249410655351e-05, "loss": 0.0477, "num_input_tokens_seen": 2427552, "step": 2420 }, { "epoch": 1.1433286185761433, "grad_norm": 0.26599815487861633, "learning_rate": 2.857142857142857e-05, "loss": 0.0658, "num_input_tokens_seen": 2432544, "step": 2425 }, { "epoch": 1.1456859971711457, "grad_norm": 2.0714528560638428, "learning_rate": 2.863036303630363e-05, "loss": 0.3126, "num_input_tokens_seen": 2437248, "step": 2430 }, { "epoch": 1.1480433757661481, "grad_norm": 0.7676981091499329, "learning_rate": 2.868929750117869e-05, "loss": 0.116, "num_input_tokens_seen": 2442912, "step": 2435 }, { "epoch": 1.1504007543611503, "grad_norm": 2.0037882328033447, "learning_rate": 2.874823196605375e-05, "loss": 0.4469, "num_input_tokens_seen": 2449184, "step": 2440 }, { "epoch": 1.1527581329561527, "grad_norm": 1.833971619606018, "learning_rate": 2.8807166430928806e-05, "loss": 0.6151, "num_input_tokens_seen": 2455008, "step": 2445 }, { "epoch": 1.155115511551155, "grad_norm": 2.175929307937622, "learning_rate": 2.8866100895803866e-05, "loss": 0.2803, "num_input_tokens_seen": 2459872, "step": 2450 }, { "epoch": 1.1574728901461575, "grad_norm": 0.9034672379493713, "learning_rate": 2.8925035360678925e-05, "loss": 0.2926, "num_input_tokens_seen": 2466016, "step": 2455 }, { "epoch": 1.15983026874116, "grad_norm": 1.6076717376708984, "learning_rate": 2.8983969825553985e-05, "loss": 0.153, "num_input_tokens_seen": 2471584, "step": 2460 }, { "epoch": 1.1621876473361623, "grad_norm": 0.2600574195384979, "learning_rate": 2.9042904290429045e-05, "loss": 0.2378, "num_input_tokens_seen": 2478400, "step": 2465 }, { "epoch": 1.1645450259311645, "grad_norm": 1.846156358718872, "learning_rate": 2.91018387553041e-05, "loss": 0.1393, "num_input_tokens_seen": 2482752, "step": 2470 }, { "epoch": 1.166902404526167, "grad_norm": 2.273141860961914, "learning_rate": 2.916077322017916e-05, "loss": 0.2291, "num_input_tokens_seen": 2487584, "step": 2475 }, { "epoch": 1.1692597831211693, "grad_norm": 0.7034480571746826, "learning_rate": 2.921970768505422e-05, "loss": 0.3368, "num_input_tokens_seen": 2491872, "step": 2480 }, { "epoch": 1.1716171617161717, "grad_norm": 1.7976726293563843, "learning_rate": 2.927864214992928e-05, "loss": 0.143, "num_input_tokens_seen": 2497056, "step": 2485 }, { "epoch": 1.1739745403111739, "grad_norm": 0.9459912180900574, "learning_rate": 2.933757661480434e-05, "loss": 0.2634, "num_input_tokens_seen": 2502880, "step": 2490 }, { "epoch": 1.1763319189061763, "grad_norm": 0.6726357340812683, "learning_rate": 2.9396511079679396e-05, "loss": 0.1662, "num_input_tokens_seen": 2508480, "step": 2495 }, { "epoch": 1.1786892975011787, "grad_norm": 1.3234554529190063, "learning_rate": 2.9455445544554455e-05, "loss": 0.1313, "num_input_tokens_seen": 2512960, "step": 2500 }, { "epoch": 1.181046676096181, "grad_norm": 0.48799145221710205, "learning_rate": 2.9514380009429515e-05, "loss": 0.2914, "num_input_tokens_seen": 2519456, "step": 2505 }, { "epoch": 1.1834040546911835, "grad_norm": 0.3096521496772766, "learning_rate": 2.9573314474304574e-05, "loss": 0.1418, "num_input_tokens_seen": 2525216, "step": 2510 }, { "epoch": 1.1857614332861859, "grad_norm": 1.732455849647522, "learning_rate": 2.9632248939179634e-05, "loss": 0.2274, "num_input_tokens_seen": 2530240, "step": 2515 }, { "epoch": 1.188118811881188, "grad_norm": 0.2721810042858124, "learning_rate": 2.969118340405469e-05, "loss": 0.1312, "num_input_tokens_seen": 2535552, "step": 2520 }, { "epoch": 1.1904761904761905, "grad_norm": 0.9128817319869995, "learning_rate": 2.975011786892975e-05, "loss": 0.1527, "num_input_tokens_seen": 2540704, "step": 2525 }, { "epoch": 1.1928335690711929, "grad_norm": 0.2058170586824417, "learning_rate": 2.980905233380481e-05, "loss": 0.1019, "num_input_tokens_seen": 2545376, "step": 2530 }, { "epoch": 1.1951909476661953, "grad_norm": 3.0450503826141357, "learning_rate": 2.986798679867987e-05, "loss": 0.2135, "num_input_tokens_seen": 2550592, "step": 2535 }, { "epoch": 1.1975483262611974, "grad_norm": 1.266893982887268, "learning_rate": 2.992692126355493e-05, "loss": 0.1192, "num_input_tokens_seen": 2554912, "step": 2540 }, { "epoch": 1.1999057048561999, "grad_norm": 1.2959954738616943, "learning_rate": 2.9985855728429985e-05, "loss": 0.1427, "num_input_tokens_seen": 2560064, "step": 2545 }, { "epoch": 1.2022630834512023, "grad_norm": 0.7352696657180786, "learning_rate": 3.0044790193305045e-05, "loss": 0.1616, "num_input_tokens_seen": 2565184, "step": 2550 }, { "epoch": 1.2046204620462047, "grad_norm": 0.15875688195228577, "learning_rate": 3.0103724658180104e-05, "loss": 0.3848, "num_input_tokens_seen": 2569696, "step": 2555 }, { "epoch": 1.206977840641207, "grad_norm": 0.42251425981521606, "learning_rate": 3.0162659123055164e-05, "loss": 0.1893, "num_input_tokens_seen": 2574112, "step": 2560 }, { "epoch": 1.2093352192362095, "grad_norm": 1.7705460786819458, "learning_rate": 3.0221593587930224e-05, "loss": 0.2794, "num_input_tokens_seen": 2578688, "step": 2565 }, { "epoch": 1.2116925978312116, "grad_norm": 1.0794597864151, "learning_rate": 3.028052805280528e-05, "loss": 0.1863, "num_input_tokens_seen": 2583552, "step": 2570 }, { "epoch": 1.214049976426214, "grad_norm": 0.41998910903930664, "learning_rate": 3.033946251768034e-05, "loss": 0.236, "num_input_tokens_seen": 2588384, "step": 2575 }, { "epoch": 1.2164073550212164, "grad_norm": 2.864000082015991, "learning_rate": 3.03983969825554e-05, "loss": 0.2431, "num_input_tokens_seen": 2593504, "step": 2580 }, { "epoch": 1.2187647336162188, "grad_norm": 0.4711858332157135, "learning_rate": 3.045733144743046e-05, "loss": 0.0945, "num_input_tokens_seen": 2598752, "step": 2585 }, { "epoch": 1.221122112211221, "grad_norm": 0.6976082921028137, "learning_rate": 3.051626591230552e-05, "loss": 0.2637, "num_input_tokens_seen": 2604096, "step": 2590 }, { "epoch": 1.2234794908062234, "grad_norm": 0.25299790501594543, "learning_rate": 3.057520037718057e-05, "loss": 0.1845, "num_input_tokens_seen": 2608960, "step": 2595 }, { "epoch": 1.2258368694012258, "grad_norm": 1.2545273303985596, "learning_rate": 3.063413484205563e-05, "loss": 0.3413, "num_input_tokens_seen": 2614528, "step": 2600 }, { "epoch": 1.2281942479962282, "grad_norm": 0.3317265510559082, "learning_rate": 3.06930693069307e-05, "loss": 0.1082, "num_input_tokens_seen": 2619712, "step": 2605 }, { "epoch": 1.2305516265912306, "grad_norm": 1.2086760997772217, "learning_rate": 3.075200377180576e-05, "loss": 0.1438, "num_input_tokens_seen": 2624096, "step": 2610 }, { "epoch": 1.232909005186233, "grad_norm": 1.9184798002243042, "learning_rate": 3.0810938236680817e-05, "loss": 0.2804, "num_input_tokens_seen": 2628256, "step": 2615 }, { "epoch": 1.2352663837812352, "grad_norm": 1.3426469564437866, "learning_rate": 3.086987270155587e-05, "loss": 0.2549, "num_input_tokens_seen": 2632448, "step": 2620 }, { "epoch": 1.2376237623762376, "grad_norm": 1.4734069108963013, "learning_rate": 3.092880716643093e-05, "loss": 0.1158, "num_input_tokens_seen": 2637696, "step": 2625 }, { "epoch": 1.23998114097124, "grad_norm": 0.854918360710144, "learning_rate": 3.098774163130599e-05, "loss": 0.0608, "num_input_tokens_seen": 2642240, "step": 2630 }, { "epoch": 1.2423385195662424, "grad_norm": 0.09384822100400925, "learning_rate": 3.104667609618105e-05, "loss": 0.1709, "num_input_tokens_seen": 2647040, "step": 2635 }, { "epoch": 1.2446958981612446, "grad_norm": 0.1485861837863922, "learning_rate": 3.110561056105611e-05, "loss": 0.0587, "num_input_tokens_seen": 2652032, "step": 2640 }, { "epoch": 1.247053276756247, "grad_norm": 4.678390979766846, "learning_rate": 3.116454502593116e-05, "loss": 0.3198, "num_input_tokens_seen": 2656768, "step": 2645 }, { "epoch": 1.2494106553512494, "grad_norm": 0.6616905331611633, "learning_rate": 3.122347949080622e-05, "loss": 0.1711, "num_input_tokens_seen": 2663008, "step": 2650 }, { "epoch": 1.2517680339462518, "grad_norm": 1.0242929458618164, "learning_rate": 3.128241395568128e-05, "loss": 0.09, "num_input_tokens_seen": 2668704, "step": 2655 }, { "epoch": 1.2541254125412542, "grad_norm": 0.45965448021888733, "learning_rate": 3.1341348420556346e-05, "loss": 0.2109, "num_input_tokens_seen": 2673056, "step": 2660 }, { "epoch": 1.2564827911362566, "grad_norm": 1.892600417137146, "learning_rate": 3.1400282885431406e-05, "loss": 0.2679, "num_input_tokens_seen": 2677760, "step": 2665 }, { "epoch": 1.2588401697312588, "grad_norm": 2.399582862854004, "learning_rate": 3.145921735030646e-05, "loss": 0.1996, "num_input_tokens_seen": 2682560, "step": 2670 }, { "epoch": 1.2611975483262612, "grad_norm": 1.887198805809021, "learning_rate": 3.151815181518152e-05, "loss": 0.1452, "num_input_tokens_seen": 2688000, "step": 2675 }, { "epoch": 1.2635549269212636, "grad_norm": 1.744476318359375, "learning_rate": 3.157708628005658e-05, "loss": 0.2253, "num_input_tokens_seen": 2692832, "step": 2680 }, { "epoch": 1.265912305516266, "grad_norm": 1.3631737232208252, "learning_rate": 3.163602074493164e-05, "loss": 0.1981, "num_input_tokens_seen": 2698528, "step": 2685 }, { "epoch": 1.2682696841112682, "grad_norm": 0.8083083033561707, "learning_rate": 3.16949552098067e-05, "loss": 0.1174, "num_input_tokens_seen": 2702336, "step": 2690 }, { "epoch": 1.2706270627062706, "grad_norm": 2.323399305343628, "learning_rate": 3.175388967468175e-05, "loss": 0.2298, "num_input_tokens_seen": 2707008, "step": 2695 }, { "epoch": 1.272984441301273, "grad_norm": 1.6808308362960815, "learning_rate": 3.181282413955681e-05, "loss": 0.4129, "num_input_tokens_seen": 2711392, "step": 2700 }, { "epoch": 1.2753418198962754, "grad_norm": 0.571773111820221, "learning_rate": 3.187175860443187e-05, "loss": 0.3058, "num_input_tokens_seen": 2716000, "step": 2705 }, { "epoch": 1.2776991984912778, "grad_norm": 1.7477086782455444, "learning_rate": 3.1930693069306936e-05, "loss": 0.1856, "num_input_tokens_seen": 2720736, "step": 2710 }, { "epoch": 1.2800565770862802, "grad_norm": 0.1533554345369339, "learning_rate": 3.1989627534181996e-05, "loss": 0.1979, "num_input_tokens_seen": 2725824, "step": 2715 }, { "epoch": 1.2824139556812824, "grad_norm": 0.9659508466720581, "learning_rate": 3.204856199905705e-05, "loss": 0.1629, "num_input_tokens_seen": 2731168, "step": 2720 }, { "epoch": 1.2847713342762848, "grad_norm": 0.6259616613388062, "learning_rate": 3.210749646393211e-05, "loss": 0.1152, "num_input_tokens_seen": 2736128, "step": 2725 }, { "epoch": 1.2871287128712872, "grad_norm": 0.2313687950372696, "learning_rate": 3.216643092880717e-05, "loss": 0.2051, "num_input_tokens_seen": 2740256, "step": 2730 }, { "epoch": 1.2894860914662896, "grad_norm": 1.8403263092041016, "learning_rate": 3.222536539368223e-05, "loss": 0.2841, "num_input_tokens_seen": 2745728, "step": 2735 }, { "epoch": 1.2918434700612917, "grad_norm": 3.3388798236846924, "learning_rate": 3.228429985855729e-05, "loss": 0.2234, "num_input_tokens_seen": 2750368, "step": 2740 }, { "epoch": 1.2942008486562941, "grad_norm": 1.2079490423202515, "learning_rate": 3.234323432343234e-05, "loss": 0.3475, "num_input_tokens_seen": 2755520, "step": 2745 }, { "epoch": 1.2965582272512965, "grad_norm": 3.8393959999084473, "learning_rate": 3.24021687883074e-05, "loss": 0.2435, "num_input_tokens_seen": 2760192, "step": 2750 }, { "epoch": 1.298915605846299, "grad_norm": 0.5460264086723328, "learning_rate": 3.246110325318246e-05, "loss": 0.3016, "num_input_tokens_seen": 2765920, "step": 2755 }, { "epoch": 1.3012729844413014, "grad_norm": 0.3466709554195404, "learning_rate": 3.2520037718057526e-05, "loss": 0.1226, "num_input_tokens_seen": 2770624, "step": 2760 }, { "epoch": 1.3036303630363038, "grad_norm": 0.932809054851532, "learning_rate": 3.2578972182932585e-05, "loss": 0.2106, "num_input_tokens_seen": 2776960, "step": 2765 }, { "epoch": 1.305987741631306, "grad_norm": 0.7421944737434387, "learning_rate": 3.263790664780764e-05, "loss": 0.1392, "num_input_tokens_seen": 2782048, "step": 2770 }, { "epoch": 1.3083451202263083, "grad_norm": 1.7077662944793701, "learning_rate": 3.26968411126827e-05, "loss": 0.1255, "num_input_tokens_seen": 2787520, "step": 2775 }, { "epoch": 1.3107024988213107, "grad_norm": 0.6553915739059448, "learning_rate": 3.275577557755776e-05, "loss": 0.1106, "num_input_tokens_seen": 2792128, "step": 2780 }, { "epoch": 1.3130598774163131, "grad_norm": 0.12469520419836044, "learning_rate": 3.281471004243282e-05, "loss": 0.2465, "num_input_tokens_seen": 2797920, "step": 2785 }, { "epoch": 1.3154172560113153, "grad_norm": 1.576668381690979, "learning_rate": 3.2873644507307877e-05, "loss": 0.1842, "num_input_tokens_seen": 2802368, "step": 2790 }, { "epoch": 1.3177746346063177, "grad_norm": 0.34178346395492554, "learning_rate": 3.293257897218293e-05, "loss": 0.221, "num_input_tokens_seen": 2806592, "step": 2795 }, { "epoch": 1.3201320132013201, "grad_norm": 2.366925001144409, "learning_rate": 3.299151343705799e-05, "loss": 0.1556, "num_input_tokens_seen": 2811648, "step": 2800 }, { "epoch": 1.3224893917963225, "grad_norm": 0.4956008195877075, "learning_rate": 3.305044790193305e-05, "loss": 0.2001, "num_input_tokens_seen": 2817088, "step": 2805 }, { "epoch": 1.324846770391325, "grad_norm": 0.10547614842653275, "learning_rate": 3.310938236680811e-05, "loss": 0.1427, "num_input_tokens_seen": 2822240, "step": 2810 }, { "epoch": 1.3272041489863273, "grad_norm": 1.7753705978393555, "learning_rate": 3.3168316831683175e-05, "loss": 0.1845, "num_input_tokens_seen": 2826400, "step": 2815 }, { "epoch": 1.3295615275813295, "grad_norm": 3.67834210395813, "learning_rate": 3.322725129655823e-05, "loss": 0.3437, "num_input_tokens_seen": 2832032, "step": 2820 }, { "epoch": 1.331918906176332, "grad_norm": 2.1836016178131104, "learning_rate": 3.328618576143329e-05, "loss": 0.3661, "num_input_tokens_seen": 2840000, "step": 2825 }, { "epoch": 1.3342762847713343, "grad_norm": 0.9033606052398682, "learning_rate": 3.334512022630835e-05, "loss": 0.2404, "num_input_tokens_seen": 2845312, "step": 2830 }, { "epoch": 1.3366336633663367, "grad_norm": 5.232038497924805, "learning_rate": 3.3404054691183406e-05, "loss": 0.3893, "num_input_tokens_seen": 2849984, "step": 2835 }, { "epoch": 1.338991041961339, "grad_norm": 0.46881064772605896, "learning_rate": 3.3462989156058466e-05, "loss": 0.1307, "num_input_tokens_seen": 2854944, "step": 2840 }, { "epoch": 1.3413484205563413, "grad_norm": 1.0383524894714355, "learning_rate": 3.352192362093352e-05, "loss": 0.148, "num_input_tokens_seen": 2859776, "step": 2845 }, { "epoch": 1.3437057991513437, "grad_norm": 1.5325875282287598, "learning_rate": 3.358085808580858e-05, "loss": 0.1417, "num_input_tokens_seen": 2865472, "step": 2850 }, { "epoch": 1.346063177746346, "grad_norm": 0.8939050436019897, "learning_rate": 3.363979255068364e-05, "loss": 0.1871, "num_input_tokens_seen": 2870560, "step": 2855 }, { "epoch": 1.3484205563413485, "grad_norm": 2.091658353805542, "learning_rate": 3.36987270155587e-05, "loss": 0.1038, "num_input_tokens_seen": 2874592, "step": 2860 }, { "epoch": 1.350777934936351, "grad_norm": 0.7085261344909668, "learning_rate": 3.3757661480433764e-05, "loss": 0.151, "num_input_tokens_seen": 2879264, "step": 2865 }, { "epoch": 1.353135313531353, "grad_norm": 0.22288796305656433, "learning_rate": 3.381659594530882e-05, "loss": 0.0987, "num_input_tokens_seen": 2884416, "step": 2870 }, { "epoch": 1.3554926921263555, "grad_norm": 2.0646069049835205, "learning_rate": 3.387553041018388e-05, "loss": 0.1138, "num_input_tokens_seen": 2889088, "step": 2875 }, { "epoch": 1.3578500707213579, "grad_norm": 4.108591079711914, "learning_rate": 3.3934464875058936e-05, "loss": 0.2459, "num_input_tokens_seen": 2893088, "step": 2880 }, { "epoch": 1.3602074493163603, "grad_norm": 0.08127786964178085, "learning_rate": 3.3993399339933996e-05, "loss": 0.0998, "num_input_tokens_seen": 2899776, "step": 2885 }, { "epoch": 1.3625648279113625, "grad_norm": 0.3557914197444916, "learning_rate": 3.4052333804809056e-05, "loss": 0.3344, "num_input_tokens_seen": 2905536, "step": 2890 }, { "epoch": 1.3649222065063649, "grad_norm": 9.296682357788086, "learning_rate": 3.411126826968411e-05, "loss": 0.1775, "num_input_tokens_seen": 2910240, "step": 2895 }, { "epoch": 1.3672795851013673, "grad_norm": 2.789764881134033, "learning_rate": 3.417020273455917e-05, "loss": 0.2822, "num_input_tokens_seen": 2915040, "step": 2900 }, { "epoch": 1.3696369636963697, "grad_norm": 0.659432053565979, "learning_rate": 3.422913719943423e-05, "loss": 0.3562, "num_input_tokens_seen": 2919968, "step": 2905 }, { "epoch": 1.371994342291372, "grad_norm": 0.36918559670448303, "learning_rate": 3.428807166430929e-05, "loss": 0.1204, "num_input_tokens_seen": 2924992, "step": 2910 }, { "epoch": 1.3743517208863745, "grad_norm": 2.6821694374084473, "learning_rate": 3.434700612918435e-05, "loss": 0.218, "num_input_tokens_seen": 2930912, "step": 2915 }, { "epoch": 1.3767090994813767, "grad_norm": 2.3283851146698, "learning_rate": 3.440594059405941e-05, "loss": 0.1747, "num_input_tokens_seen": 2935744, "step": 2920 }, { "epoch": 1.379066478076379, "grad_norm": 1.6142903566360474, "learning_rate": 3.4464875058934466e-05, "loss": 0.4606, "num_input_tokens_seen": 2940256, "step": 2925 }, { "epoch": 1.3814238566713815, "grad_norm": 0.3647494912147522, "learning_rate": 3.4523809523809526e-05, "loss": 0.1528, "num_input_tokens_seen": 2945696, "step": 2930 }, { "epoch": 1.3837812352663839, "grad_norm": 1.4834977388381958, "learning_rate": 3.4582743988684586e-05, "loss": 0.3017, "num_input_tokens_seen": 2950304, "step": 2935 }, { "epoch": 1.386138613861386, "grad_norm": 0.5800021886825562, "learning_rate": 3.4641678453559645e-05, "loss": 0.1651, "num_input_tokens_seen": 2955200, "step": 2940 }, { "epoch": 1.3884959924563884, "grad_norm": 0.44167929887771606, "learning_rate": 3.47006129184347e-05, "loss": 0.2846, "num_input_tokens_seen": 2960256, "step": 2945 }, { "epoch": 1.3908533710513908, "grad_norm": 0.0694163516163826, "learning_rate": 3.475954738330976e-05, "loss": 0.08, "num_input_tokens_seen": 2964704, "step": 2950 }, { "epoch": 1.3932107496463932, "grad_norm": 1.4023438692092896, "learning_rate": 3.481848184818482e-05, "loss": 0.2769, "num_input_tokens_seen": 2969312, "step": 2955 }, { "epoch": 1.3955681282413956, "grad_norm": 3.5922787189483643, "learning_rate": 3.487741631305988e-05, "loss": 0.233, "num_input_tokens_seen": 2973728, "step": 2960 }, { "epoch": 1.397925506836398, "grad_norm": 0.7690333724021912, "learning_rate": 3.4936350777934937e-05, "loss": 0.3202, "num_input_tokens_seen": 2979424, "step": 2965 }, { "epoch": 1.4002828854314002, "grad_norm": 1.0257821083068848, "learning_rate": 3.4995285242809996e-05, "loss": 0.0514, "num_input_tokens_seen": 2984608, "step": 2970 }, { "epoch": 1.4026402640264026, "grad_norm": 0.8002281188964844, "learning_rate": 3.5054219707685056e-05, "loss": 0.0863, "num_input_tokens_seen": 2989248, "step": 2975 }, { "epoch": 1.404997642621405, "grad_norm": 0.6836122274398804, "learning_rate": 3.5113154172560115e-05, "loss": 0.2726, "num_input_tokens_seen": 2993248, "step": 2980 }, { "epoch": 1.4073550212164074, "grad_norm": 0.5996866822242737, "learning_rate": 3.5172088637435175e-05, "loss": 0.1436, "num_input_tokens_seen": 2997920, "step": 2985 }, { "epoch": 1.4097123998114096, "grad_norm": 0.6665914058685303, "learning_rate": 3.5231023102310235e-05, "loss": 0.2163, "num_input_tokens_seen": 3003776, "step": 2990 }, { "epoch": 1.412069778406412, "grad_norm": 0.7995374202728271, "learning_rate": 3.528995756718529e-05, "loss": 0.2643, "num_input_tokens_seen": 3008096, "step": 2995 }, { "epoch": 1.4144271570014144, "grad_norm": 0.6133872866630554, "learning_rate": 3.534889203206035e-05, "loss": 0.064, "num_input_tokens_seen": 3013152, "step": 3000 }, { "epoch": 1.4167845355964168, "grad_norm": 0.8769453763961792, "learning_rate": 3.540782649693541e-05, "loss": 0.133, "num_input_tokens_seen": 3017088, "step": 3005 }, { "epoch": 1.4191419141914192, "grad_norm": 2.653911590576172, "learning_rate": 3.5466760961810467e-05, "loss": 0.3245, "num_input_tokens_seen": 3021824, "step": 3010 }, { "epoch": 1.4214992927864216, "grad_norm": 1.856600046157837, "learning_rate": 3.5525695426685526e-05, "loss": 0.2222, "num_input_tokens_seen": 3026144, "step": 3015 }, { "epoch": 1.4238566713814238, "grad_norm": 0.19194623827934265, "learning_rate": 3.5584629891560586e-05, "loss": 0.0537, "num_input_tokens_seen": 3030784, "step": 3020 }, { "epoch": 1.4262140499764262, "grad_norm": 0.28840455412864685, "learning_rate": 3.5643564356435645e-05, "loss": 0.1737, "num_input_tokens_seen": 3035328, "step": 3025 }, { "epoch": 1.4285714285714286, "grad_norm": 0.1362939327955246, "learning_rate": 3.5702498821310705e-05, "loss": 0.3717, "num_input_tokens_seen": 3040224, "step": 3030 }, { "epoch": 1.430928807166431, "grad_norm": 2.447685956954956, "learning_rate": 3.5761433286185765e-05, "loss": 0.2187, "num_input_tokens_seen": 3045344, "step": 3035 }, { "epoch": 1.4332861857614332, "grad_norm": 0.8633464574813843, "learning_rate": 3.5820367751060824e-05, "loss": 0.1488, "num_input_tokens_seen": 3049376, "step": 3040 }, { "epoch": 1.4356435643564356, "grad_norm": 1.4918079376220703, "learning_rate": 3.587930221593588e-05, "loss": 0.1562, "num_input_tokens_seen": 3054944, "step": 3045 }, { "epoch": 1.438000942951438, "grad_norm": 12.033173561096191, "learning_rate": 3.593823668081094e-05, "loss": 0.1922, "num_input_tokens_seen": 3060672, "step": 3050 }, { "epoch": 1.4403583215464404, "grad_norm": 0.07517169415950775, "learning_rate": 3.5997171145685996e-05, "loss": 0.2601, "num_input_tokens_seen": 3065056, "step": 3055 }, { "epoch": 1.4427157001414428, "grad_norm": 2.9714865684509277, "learning_rate": 3.6056105610561056e-05, "loss": 0.1398, "num_input_tokens_seen": 3069536, "step": 3060 }, { "epoch": 1.4450730787364452, "grad_norm": 0.5736239552497864, "learning_rate": 3.6115040075436116e-05, "loss": 0.2525, "num_input_tokens_seen": 3074560, "step": 3065 }, { "epoch": 1.4474304573314474, "grad_norm": 2.800243854522705, "learning_rate": 3.6173974540311175e-05, "loss": 0.2915, "num_input_tokens_seen": 3078688, "step": 3070 }, { "epoch": 1.4497878359264498, "grad_norm": 0.9045893549919128, "learning_rate": 3.6232909005186235e-05, "loss": 0.1606, "num_input_tokens_seen": 3083552, "step": 3075 }, { "epoch": 1.4521452145214522, "grad_norm": 3.524801015853882, "learning_rate": 3.6291843470061295e-05, "loss": 0.3675, "num_input_tokens_seen": 3088352, "step": 3080 }, { "epoch": 1.4545025931164546, "grad_norm": 2.5320074558258057, "learning_rate": 3.6350777934936354e-05, "loss": 0.3275, "num_input_tokens_seen": 3092960, "step": 3085 }, { "epoch": 1.4568599717114568, "grad_norm": 0.552950382232666, "learning_rate": 3.6409712399811414e-05, "loss": 0.1466, "num_input_tokens_seen": 3098592, "step": 3090 }, { "epoch": 1.4592173503064592, "grad_norm": 0.5482760667800903, "learning_rate": 3.646864686468647e-05, "loss": 0.0944, "num_input_tokens_seen": 3103840, "step": 3095 }, { "epoch": 1.4615747289014616, "grad_norm": 1.2679911851882935, "learning_rate": 3.6527581329561526e-05, "loss": 0.0922, "num_input_tokens_seen": 3109312, "step": 3100 }, { "epoch": 1.463932107496464, "grad_norm": 1.5703272819519043, "learning_rate": 3.6586515794436586e-05, "loss": 0.2874, "num_input_tokens_seen": 3113984, "step": 3105 }, { "epoch": 1.4662894860914664, "grad_norm": 0.7192265391349792, "learning_rate": 3.6645450259311646e-05, "loss": 0.1784, "num_input_tokens_seen": 3118400, "step": 3110 }, { "epoch": 1.4686468646864688, "grad_norm": 1.2466509342193604, "learning_rate": 3.6704384724186705e-05, "loss": 0.194, "num_input_tokens_seen": 3123488, "step": 3115 }, { "epoch": 1.471004243281471, "grad_norm": 2.741786003112793, "learning_rate": 3.6763319189061765e-05, "loss": 0.4643, "num_input_tokens_seen": 3128256, "step": 3120 }, { "epoch": 1.4733616218764733, "grad_norm": 2.174567699432373, "learning_rate": 3.6822253653936824e-05, "loss": 0.3057, "num_input_tokens_seen": 3133088, "step": 3125 }, { "epoch": 1.4757190004714758, "grad_norm": 0.4260793626308441, "learning_rate": 3.6881188118811884e-05, "loss": 0.1826, "num_input_tokens_seen": 3138336, "step": 3130 }, { "epoch": 1.4780763790664782, "grad_norm": 0.8904929757118225, "learning_rate": 3.6940122583686944e-05, "loss": 0.1673, "num_input_tokens_seen": 3142912, "step": 3135 }, { "epoch": 1.4804337576614803, "grad_norm": 0.15678775310516357, "learning_rate": 3.6999057048562003e-05, "loss": 0.0362, "num_input_tokens_seen": 3148256, "step": 3140 }, { "epoch": 1.4827911362564827, "grad_norm": 0.122397780418396, "learning_rate": 3.7057991513437056e-05, "loss": 0.1538, "num_input_tokens_seen": 3152800, "step": 3145 }, { "epoch": 1.4851485148514851, "grad_norm": 0.7512156367301941, "learning_rate": 3.7116925978312116e-05, "loss": 0.1385, "num_input_tokens_seen": 3158432, "step": 3150 }, { "epoch": 1.4875058934464875, "grad_norm": 1.272525668144226, "learning_rate": 3.7175860443187176e-05, "loss": 0.1162, "num_input_tokens_seen": 3162176, "step": 3155 }, { "epoch": 1.48986327204149, "grad_norm": 0.9283604621887207, "learning_rate": 3.7234794908062235e-05, "loss": 0.1272, "num_input_tokens_seen": 3167008, "step": 3160 }, { "epoch": 1.4922206506364923, "grad_norm": 0.4484657347202301, "learning_rate": 3.7293729372937295e-05, "loss": 0.1279, "num_input_tokens_seen": 3173056, "step": 3165 }, { "epoch": 1.4945780292314945, "grad_norm": 1.3666770458221436, "learning_rate": 3.7352663837812354e-05, "loss": 0.3029, "num_input_tokens_seen": 3178624, "step": 3170 }, { "epoch": 1.496935407826497, "grad_norm": 0.1426973044872284, "learning_rate": 3.7411598302687414e-05, "loss": 0.1916, "num_input_tokens_seen": 3183744, "step": 3175 }, { "epoch": 1.4992927864214993, "grad_norm": 3.6092469692230225, "learning_rate": 3.7470532767562474e-05, "loss": 0.1416, "num_input_tokens_seen": 3190240, "step": 3180 }, { "epoch": 1.5016501650165015, "grad_norm": 0.9299961924552917, "learning_rate": 3.752946723243753e-05, "loss": 0.1629, "num_input_tokens_seen": 3196864, "step": 3185 }, { "epoch": 1.504007543611504, "grad_norm": 0.5867211818695068, "learning_rate": 3.758840169731259e-05, "loss": 0.2149, "num_input_tokens_seen": 3202144, "step": 3190 }, { "epoch": 1.5063649222065063, "grad_norm": 2.36503529548645, "learning_rate": 3.764733616218765e-05, "loss": 0.7549, "num_input_tokens_seen": 3207680, "step": 3195 }, { "epoch": 1.5087223008015087, "grad_norm": 1.3552614450454712, "learning_rate": 3.7706270627062705e-05, "loss": 0.1651, "num_input_tokens_seen": 3212864, "step": 3200 }, { "epoch": 1.511079679396511, "grad_norm": 1.5907313823699951, "learning_rate": 3.7765205091937765e-05, "loss": 0.2348, "num_input_tokens_seen": 3218528, "step": 3205 }, { "epoch": 1.5134370579915135, "grad_norm": 0.27521461248397827, "learning_rate": 3.7824139556812825e-05, "loss": 0.1801, "num_input_tokens_seen": 3225152, "step": 3210 }, { "epoch": 1.515794436586516, "grad_norm": 0.45545250177383423, "learning_rate": 3.7883074021687884e-05, "loss": 0.1434, "num_input_tokens_seen": 3230272, "step": 3215 }, { "epoch": 1.5181518151815183, "grad_norm": 0.06455105543136597, "learning_rate": 3.7942008486562944e-05, "loss": 0.2873, "num_input_tokens_seen": 3235584, "step": 3220 }, { "epoch": 1.5205091937765205, "grad_norm": 0.17050468921661377, "learning_rate": 3.8000942951438004e-05, "loss": 0.1418, "num_input_tokens_seen": 3240064, "step": 3225 }, { "epoch": 1.522866572371523, "grad_norm": 0.27639010548591614, "learning_rate": 3.805987741631306e-05, "loss": 0.1365, "num_input_tokens_seen": 3244704, "step": 3230 }, { "epoch": 1.525223950966525, "grad_norm": 1.243996500968933, "learning_rate": 3.811881188118812e-05, "loss": 0.1875, "num_input_tokens_seen": 3248992, "step": 3235 }, { "epoch": 1.5275813295615275, "grad_norm": 0.5287883877754211, "learning_rate": 3.817774634606318e-05, "loss": 0.3738, "num_input_tokens_seen": 3254368, "step": 3240 }, { "epoch": 1.5299387081565299, "grad_norm": 0.4648299217224121, "learning_rate": 3.823668081093824e-05, "loss": 0.077, "num_input_tokens_seen": 3259488, "step": 3245 }, { "epoch": 1.5322960867515323, "grad_norm": 0.15337669849395752, "learning_rate": 3.8295615275813295e-05, "loss": 0.2672, "num_input_tokens_seen": 3264224, "step": 3250 }, { "epoch": 1.5346534653465347, "grad_norm": 2.431241035461426, "learning_rate": 3.8354549740688355e-05, "loss": 0.3221, "num_input_tokens_seen": 3269888, "step": 3255 }, { "epoch": 1.537010843941537, "grad_norm": 1.485511302947998, "learning_rate": 3.8413484205563414e-05, "loss": 0.2261, "num_input_tokens_seen": 3275040, "step": 3260 }, { "epoch": 1.5393682225365395, "grad_norm": 0.1775880604982376, "learning_rate": 3.8472418670438474e-05, "loss": 0.0279, "num_input_tokens_seen": 3279712, "step": 3265 }, { "epoch": 1.541725601131542, "grad_norm": 2.397620916366577, "learning_rate": 3.8531353135313534e-05, "loss": 0.3793, "num_input_tokens_seen": 3283776, "step": 3270 }, { "epoch": 1.544082979726544, "grad_norm": 0.20307494699954987, "learning_rate": 3.859028760018859e-05, "loss": 0.2034, "num_input_tokens_seen": 3288928, "step": 3275 }, { "epoch": 1.5464403583215465, "grad_norm": 0.5163267850875854, "learning_rate": 3.864922206506365e-05, "loss": 0.1766, "num_input_tokens_seen": 3293504, "step": 3280 }, { "epoch": 1.5487977369165487, "grad_norm": 2.618396759033203, "learning_rate": 3.870815652993871e-05, "loss": 0.1817, "num_input_tokens_seen": 3297792, "step": 3285 }, { "epoch": 1.551155115511551, "grad_norm": 1.4251598119735718, "learning_rate": 3.876709099481377e-05, "loss": 0.1087, "num_input_tokens_seen": 3304736, "step": 3290 }, { "epoch": 1.5535124941065535, "grad_norm": 0.9577255249023438, "learning_rate": 3.882602545968883e-05, "loss": 0.3212, "num_input_tokens_seen": 3310048, "step": 3295 }, { "epoch": 1.5558698727015559, "grad_norm": 0.6275179386138916, "learning_rate": 3.8884959924563885e-05, "loss": 0.1803, "num_input_tokens_seen": 3315328, "step": 3300 }, { "epoch": 1.5582272512965583, "grad_norm": 0.5972018241882324, "learning_rate": 3.8943894389438944e-05, "loss": 0.3587, "num_input_tokens_seen": 3320416, "step": 3305 }, { "epoch": 1.5605846298915607, "grad_norm": 1.1082106828689575, "learning_rate": 3.9002828854314004e-05, "loss": 0.1102, "num_input_tokens_seen": 3325056, "step": 3310 }, { "epoch": 1.562942008486563, "grad_norm": 0.8044413328170776, "learning_rate": 3.9061763319189063e-05, "loss": 0.1581, "num_input_tokens_seen": 3328672, "step": 3315 }, { "epoch": 1.5652993870815655, "grad_norm": 0.021912558004260063, "learning_rate": 3.912069778406412e-05, "loss": 0.1692, "num_input_tokens_seen": 3333376, "step": 3320 }, { "epoch": 1.5676567656765676, "grad_norm": 0.15745213627815247, "learning_rate": 3.917963224893918e-05, "loss": 0.2747, "num_input_tokens_seen": 3339424, "step": 3325 }, { "epoch": 1.57001414427157, "grad_norm": 0.842227041721344, "learning_rate": 3.923856671381424e-05, "loss": 0.206, "num_input_tokens_seen": 3343968, "step": 3330 }, { "epoch": 1.5723715228665722, "grad_norm": 1.4458626508712769, "learning_rate": 3.92975011786893e-05, "loss": 0.1555, "num_input_tokens_seen": 3349856, "step": 3335 }, { "epoch": 1.5747289014615746, "grad_norm": 1.1915189027786255, "learning_rate": 3.935643564356436e-05, "loss": 0.1255, "num_input_tokens_seen": 3355232, "step": 3340 }, { "epoch": 1.577086280056577, "grad_norm": 0.7065159678459167, "learning_rate": 3.941537010843942e-05, "loss": 0.0713, "num_input_tokens_seen": 3360288, "step": 3345 }, { "epoch": 1.5794436586515794, "grad_norm": 0.24723009765148163, "learning_rate": 3.9474304573314474e-05, "loss": 0.4066, "num_input_tokens_seen": 3364768, "step": 3350 }, { "epoch": 1.5818010372465818, "grad_norm": 1.7274233102798462, "learning_rate": 3.9533239038189534e-05, "loss": 0.2224, "num_input_tokens_seen": 3369632, "step": 3355 }, { "epoch": 1.5841584158415842, "grad_norm": 0.6670981049537659, "learning_rate": 3.959217350306459e-05, "loss": 0.1434, "num_input_tokens_seen": 3375264, "step": 3360 }, { "epoch": 1.5865157944365866, "grad_norm": 0.2599755823612213, "learning_rate": 3.965110796793965e-05, "loss": 0.0848, "num_input_tokens_seen": 3380064, "step": 3365 }, { "epoch": 1.588873173031589, "grad_norm": 0.09369120001792908, "learning_rate": 3.971004243281471e-05, "loss": 0.0757, "num_input_tokens_seen": 3384768, "step": 3370 }, { "epoch": 1.5912305516265912, "grad_norm": 1.1360499858856201, "learning_rate": 3.976897689768977e-05, "loss": 0.1806, "num_input_tokens_seen": 3388832, "step": 3375 }, { "epoch": 1.5935879302215936, "grad_norm": 0.9039750695228577, "learning_rate": 3.982791136256483e-05, "loss": 0.1452, "num_input_tokens_seen": 3392896, "step": 3380 }, { "epoch": 1.5959453088165958, "grad_norm": 1.6920968294143677, "learning_rate": 3.988684582743989e-05, "loss": 0.2301, "num_input_tokens_seen": 3398144, "step": 3385 }, { "epoch": 1.5983026874115982, "grad_norm": 0.9463897943496704, "learning_rate": 3.994578029231495e-05, "loss": 0.1512, "num_input_tokens_seen": 3403232, "step": 3390 }, { "epoch": 1.6006600660066006, "grad_norm": 0.4202144145965576, "learning_rate": 4.000471475719001e-05, "loss": 0.2082, "num_input_tokens_seen": 3408128, "step": 3395 }, { "epoch": 1.603017444601603, "grad_norm": 0.44460129737854004, "learning_rate": 4.0063649222065064e-05, "loss": 0.1216, "num_input_tokens_seen": 3412864, "step": 3400 }, { "epoch": 1.6053748231966054, "grad_norm": 0.21490804851055145, "learning_rate": 4.012258368694012e-05, "loss": 0.2849, "num_input_tokens_seen": 3418016, "step": 3405 }, { "epoch": 1.6077322017916078, "grad_norm": 1.0972545146942139, "learning_rate": 4.018151815181518e-05, "loss": 0.1243, "num_input_tokens_seen": 3422688, "step": 3410 }, { "epoch": 1.6100895803866102, "grad_norm": 0.2605578601360321, "learning_rate": 4.024045261669024e-05, "loss": 0.1473, "num_input_tokens_seen": 3428192, "step": 3415 }, { "epoch": 1.6124469589816126, "grad_norm": 2.8981759548187256, "learning_rate": 4.02993870815653e-05, "loss": 0.2863, "num_input_tokens_seen": 3432960, "step": 3420 }, { "epoch": 1.6148043375766148, "grad_norm": 0.3772743344306946, "learning_rate": 4.035832154644036e-05, "loss": 0.1922, "num_input_tokens_seen": 3437664, "step": 3425 }, { "epoch": 1.6171617161716172, "grad_norm": 1.4845248460769653, "learning_rate": 4.041725601131542e-05, "loss": 0.336, "num_input_tokens_seen": 3443008, "step": 3430 }, { "epoch": 1.6195190947666194, "grad_norm": 0.4726199209690094, "learning_rate": 4.047619047619048e-05, "loss": 0.107, "num_input_tokens_seen": 3447648, "step": 3435 }, { "epoch": 1.6218764733616218, "grad_norm": 1.7444570064544678, "learning_rate": 4.053512494106554e-05, "loss": 0.2047, "num_input_tokens_seen": 3452032, "step": 3440 }, { "epoch": 1.6242338519566242, "grad_norm": 0.23078323900699615, "learning_rate": 4.05940594059406e-05, "loss": 0.1509, "num_input_tokens_seen": 3457408, "step": 3445 }, { "epoch": 1.6265912305516266, "grad_norm": 0.19455112516880035, "learning_rate": 4.065299387081565e-05, "loss": 0.1994, "num_input_tokens_seen": 3462400, "step": 3450 }, { "epoch": 1.628948609146629, "grad_norm": 0.726399302482605, "learning_rate": 4.071192833569071e-05, "loss": 0.3536, "num_input_tokens_seen": 3466816, "step": 3455 }, { "epoch": 1.6313059877416314, "grad_norm": 0.9709278345108032, "learning_rate": 4.077086280056577e-05, "loss": 0.1817, "num_input_tokens_seen": 3471744, "step": 3460 }, { "epoch": 1.6336633663366338, "grad_norm": 0.19504638016223907, "learning_rate": 4.082979726544083e-05, "loss": 0.1122, "num_input_tokens_seen": 3476608, "step": 3465 }, { "epoch": 1.6360207449316362, "grad_norm": 2.9861204624176025, "learning_rate": 4.088873173031589e-05, "loss": 0.1543, "num_input_tokens_seen": 3480544, "step": 3470 }, { "epoch": 1.6383781235266384, "grad_norm": 0.8013817071914673, "learning_rate": 4.094766619519095e-05, "loss": 0.2408, "num_input_tokens_seen": 3486944, "step": 3475 }, { "epoch": 1.6407355021216408, "grad_norm": 2.889164686203003, "learning_rate": 4.100660066006601e-05, "loss": 0.3887, "num_input_tokens_seen": 3492256, "step": 3480 }, { "epoch": 1.643092880716643, "grad_norm": 0.17256586253643036, "learning_rate": 4.106553512494107e-05, "loss": 0.0909, "num_input_tokens_seen": 3499072, "step": 3485 }, { "epoch": 1.6454502593116453, "grad_norm": 0.4101530909538269, "learning_rate": 4.112446958981613e-05, "loss": 0.2094, "num_input_tokens_seen": 3502912, "step": 3490 }, { "epoch": 1.6478076379066477, "grad_norm": 1.1821396350860596, "learning_rate": 4.118340405469119e-05, "loss": 0.2561, "num_input_tokens_seen": 3507808, "step": 3495 }, { "epoch": 1.6501650165016502, "grad_norm": 0.13997872173786163, "learning_rate": 4.124233851956624e-05, "loss": 0.2033, "num_input_tokens_seen": 3512896, "step": 3500 }, { "epoch": 1.6525223950966526, "grad_norm": 0.12110988795757294, "learning_rate": 4.13012729844413e-05, "loss": 0.214, "num_input_tokens_seen": 3517632, "step": 3505 }, { "epoch": 1.654879773691655, "grad_norm": 1.8233838081359863, "learning_rate": 4.136020744931636e-05, "loss": 0.1487, "num_input_tokens_seen": 3522464, "step": 3510 }, { "epoch": 1.6572371522866574, "grad_norm": 0.4674221873283386, "learning_rate": 4.141914191419142e-05, "loss": 0.1694, "num_input_tokens_seen": 3527008, "step": 3515 }, { "epoch": 1.6595945308816598, "grad_norm": 0.29243531823158264, "learning_rate": 4.147807637906648e-05, "loss": 0.2047, "num_input_tokens_seen": 3531584, "step": 3520 }, { "epoch": 1.661951909476662, "grad_norm": 1.2050355672836304, "learning_rate": 4.1537010843941534e-05, "loss": 0.1161, "num_input_tokens_seen": 3537920, "step": 3525 }, { "epoch": 1.6643092880716643, "grad_norm": 0.3306163549423218, "learning_rate": 4.15959453088166e-05, "loss": 0.1707, "num_input_tokens_seen": 3543360, "step": 3530 }, { "epoch": 1.6666666666666665, "grad_norm": 0.34741640090942383, "learning_rate": 4.165487977369166e-05, "loss": 0.2729, "num_input_tokens_seen": 3548160, "step": 3535 }, { "epoch": 1.669024045261669, "grad_norm": 0.1597115695476532, "learning_rate": 4.171381423856672e-05, "loss": 0.4044, "num_input_tokens_seen": 3552288, "step": 3540 }, { "epoch": 1.6713814238566713, "grad_norm": 1.1835615634918213, "learning_rate": 4.177274870344178e-05, "loss": 0.1268, "num_input_tokens_seen": 3556704, "step": 3545 }, { "epoch": 1.6737388024516737, "grad_norm": 1.1015055179595947, "learning_rate": 4.183168316831683e-05, "loss": 0.2321, "num_input_tokens_seen": 3561952, "step": 3550 }, { "epoch": 1.6760961810466761, "grad_norm": 2.8485348224639893, "learning_rate": 4.189061763319189e-05, "loss": 0.2324, "num_input_tokens_seen": 3566304, "step": 3555 }, { "epoch": 1.6784535596416785, "grad_norm": 2.08886981010437, "learning_rate": 4.194955209806695e-05, "loss": 0.1032, "num_input_tokens_seen": 3571648, "step": 3560 }, { "epoch": 1.680810938236681, "grad_norm": 1.1914710998535156, "learning_rate": 4.200848656294201e-05, "loss": 0.1453, "num_input_tokens_seen": 3577728, "step": 3565 }, { "epoch": 1.6831683168316833, "grad_norm": 0.36026084423065186, "learning_rate": 4.206742102781707e-05, "loss": 0.2624, "num_input_tokens_seen": 3582400, "step": 3570 }, { "epoch": 1.6855256954266855, "grad_norm": 0.8191189169883728, "learning_rate": 4.2126355492692124e-05, "loss": 0.2139, "num_input_tokens_seen": 3587968, "step": 3575 }, { "epoch": 1.687883074021688, "grad_norm": 0.37923741340637207, "learning_rate": 4.218528995756719e-05, "loss": 0.1524, "num_input_tokens_seen": 3593408, "step": 3580 }, { "epoch": 1.69024045261669, "grad_norm": 2.2285516262054443, "learning_rate": 4.224422442244225e-05, "loss": 0.377, "num_input_tokens_seen": 3598592, "step": 3585 }, { "epoch": 1.6925978312116925, "grad_norm": 0.1552746444940567, "learning_rate": 4.230315888731731e-05, "loss": 0.205, "num_input_tokens_seen": 3603872, "step": 3590 }, { "epoch": 1.694955209806695, "grad_norm": 5.575215816497803, "learning_rate": 4.236209335219237e-05, "loss": 0.6257, "num_input_tokens_seen": 3608448, "step": 3595 }, { "epoch": 1.6973125884016973, "grad_norm": 2.7500410079956055, "learning_rate": 4.242102781706742e-05, "loss": 0.1876, "num_input_tokens_seen": 3612896, "step": 3600 }, { "epoch": 1.6996699669966997, "grad_norm": 0.6376008987426758, "learning_rate": 4.247996228194248e-05, "loss": 0.1506, "num_input_tokens_seen": 3618336, "step": 3605 }, { "epoch": 1.702027345591702, "grad_norm": 1.0331735610961914, "learning_rate": 4.253889674681754e-05, "loss": 0.1067, "num_input_tokens_seen": 3622656, "step": 3610 }, { "epoch": 1.7043847241867045, "grad_norm": 0.5660312175750732, "learning_rate": 4.25978312116926e-05, "loss": 0.1703, "num_input_tokens_seen": 3628896, "step": 3615 }, { "epoch": 1.706742102781707, "grad_norm": 1.7451183795928955, "learning_rate": 4.265676567656766e-05, "loss": 0.1686, "num_input_tokens_seen": 3633952, "step": 3620 }, { "epoch": 1.709099481376709, "grad_norm": 0.8959898948669434, "learning_rate": 4.271570014144271e-05, "loss": 0.1043, "num_input_tokens_seen": 3638752, "step": 3625 }, { "epoch": 1.7114568599717115, "grad_norm": 0.7731111645698547, "learning_rate": 4.277463460631777e-05, "loss": 0.205, "num_input_tokens_seen": 3642912, "step": 3630 }, { "epoch": 1.7138142385667137, "grad_norm": 0.6796831488609314, "learning_rate": 4.283356907119284e-05, "loss": 0.1035, "num_input_tokens_seen": 3648352, "step": 3635 }, { "epoch": 1.716171617161716, "grad_norm": 0.3242971897125244, "learning_rate": 4.28925035360679e-05, "loss": 0.0915, "num_input_tokens_seen": 3652928, "step": 3640 }, { "epoch": 1.7185289957567185, "grad_norm": 2.6546757221221924, "learning_rate": 4.295143800094296e-05, "loss": 0.2806, "num_input_tokens_seen": 3658016, "step": 3645 }, { "epoch": 1.7208863743517209, "grad_norm": 0.6845149993896484, "learning_rate": 4.301037246581801e-05, "loss": 0.122, "num_input_tokens_seen": 3662400, "step": 3650 }, { "epoch": 1.7232437529467233, "grad_norm": 0.5565664768218994, "learning_rate": 4.306930693069307e-05, "loss": 0.2285, "num_input_tokens_seen": 3666400, "step": 3655 }, { "epoch": 1.7256011315417257, "grad_norm": 0.6263489127159119, "learning_rate": 4.312824139556813e-05, "loss": 0.1843, "num_input_tokens_seen": 3672128, "step": 3660 }, { "epoch": 1.727958510136728, "grad_norm": 1.6908668279647827, "learning_rate": 4.318717586044319e-05, "loss": 0.262, "num_input_tokens_seen": 3676576, "step": 3665 }, { "epoch": 1.7303158887317305, "grad_norm": 0.6792068481445312, "learning_rate": 4.324611032531825e-05, "loss": 0.1058, "num_input_tokens_seen": 3682240, "step": 3670 }, { "epoch": 1.7326732673267327, "grad_norm": 1.2555168867111206, "learning_rate": 4.33050447901933e-05, "loss": 0.3555, "num_input_tokens_seen": 3688128, "step": 3675 }, { "epoch": 1.735030645921735, "grad_norm": 1.1777746677398682, "learning_rate": 4.336397925506836e-05, "loss": 0.1095, "num_input_tokens_seen": 3693888, "step": 3680 }, { "epoch": 1.7373880245167372, "grad_norm": 0.7090325355529785, "learning_rate": 4.342291371994343e-05, "loss": 0.3682, "num_input_tokens_seen": 3698368, "step": 3685 }, { "epoch": 1.7397454031117396, "grad_norm": 1.1729696989059448, "learning_rate": 4.348184818481849e-05, "loss": 0.1722, "num_input_tokens_seen": 3703040, "step": 3690 }, { "epoch": 1.742102781706742, "grad_norm": 0.150470569729805, "learning_rate": 4.354078264969355e-05, "loss": 0.209, "num_input_tokens_seen": 3707616, "step": 3695 }, { "epoch": 1.7444601603017444, "grad_norm": 0.69554603099823, "learning_rate": 4.35997171145686e-05, "loss": 0.1809, "num_input_tokens_seen": 3712832, "step": 3700 }, { "epoch": 1.7468175388967468, "grad_norm": 0.6557025909423828, "learning_rate": 4.365865157944366e-05, "loss": 0.1949, "num_input_tokens_seen": 3717440, "step": 3705 }, { "epoch": 1.7491749174917492, "grad_norm": 2.5024161338806152, "learning_rate": 4.371758604431872e-05, "loss": 0.3181, "num_input_tokens_seen": 3722592, "step": 3710 }, { "epoch": 1.7515322960867516, "grad_norm": 0.6081795692443848, "learning_rate": 4.377652050919378e-05, "loss": 0.1315, "num_input_tokens_seen": 3727520, "step": 3715 }, { "epoch": 1.753889674681754, "grad_norm": 0.8376355767250061, "learning_rate": 4.383545497406884e-05, "loss": 0.1038, "num_input_tokens_seen": 3732096, "step": 3720 }, { "epoch": 1.7562470532767562, "grad_norm": 2.3318841457366943, "learning_rate": 4.389438943894389e-05, "loss": 0.2275, "num_input_tokens_seen": 3736480, "step": 3725 }, { "epoch": 1.7586044318717586, "grad_norm": 1.0651755332946777, "learning_rate": 4.395332390381895e-05, "loss": 0.197, "num_input_tokens_seen": 3741152, "step": 3730 }, { "epoch": 1.7609618104667608, "grad_norm": 2.3047893047332764, "learning_rate": 4.401225836869401e-05, "loss": 0.1707, "num_input_tokens_seen": 3746080, "step": 3735 }, { "epoch": 1.7633191890617632, "grad_norm": 0.39202478528022766, "learning_rate": 4.407119283356908e-05, "loss": 0.0941, "num_input_tokens_seen": 3751616, "step": 3740 }, { "epoch": 1.7656765676567656, "grad_norm": 0.3275199234485626, "learning_rate": 4.413012729844414e-05, "loss": 0.0605, "num_input_tokens_seen": 3757024, "step": 3745 }, { "epoch": 1.768033946251768, "grad_norm": 0.9779569506645203, "learning_rate": 4.418906176331919e-05, "loss": 0.0821, "num_input_tokens_seen": 3761344, "step": 3750 }, { "epoch": 1.7703913248467704, "grad_norm": 1.5263371467590332, "learning_rate": 4.424799622819425e-05, "loss": 0.1537, "num_input_tokens_seen": 3766336, "step": 3755 }, { "epoch": 1.7727487034417728, "grad_norm": 2.0377116203308105, "learning_rate": 4.430693069306931e-05, "loss": 0.2289, "num_input_tokens_seen": 3770560, "step": 3760 }, { "epoch": 1.7751060820367752, "grad_norm": 3.235076904296875, "learning_rate": 4.436586515794437e-05, "loss": 0.436, "num_input_tokens_seen": 3774592, "step": 3765 }, { "epoch": 1.7774634606317776, "grad_norm": 0.7950683236122131, "learning_rate": 4.442479962281943e-05, "loss": 0.27, "num_input_tokens_seen": 3780064, "step": 3770 }, { "epoch": 1.7798208392267798, "grad_norm": 1.0429608821868896, "learning_rate": 4.448373408769448e-05, "loss": 0.2418, "num_input_tokens_seen": 3784480, "step": 3775 }, { "epoch": 1.7821782178217822, "grad_norm": 0.20811982452869415, "learning_rate": 4.454266855256954e-05, "loss": 0.1639, "num_input_tokens_seen": 3791424, "step": 3780 }, { "epoch": 1.7845355964167844, "grad_norm": 1.7107959985733032, "learning_rate": 4.46016030174446e-05, "loss": 0.1755, "num_input_tokens_seen": 3796736, "step": 3785 }, { "epoch": 1.7868929750117868, "grad_norm": 4.230134963989258, "learning_rate": 4.466053748231967e-05, "loss": 0.3421, "num_input_tokens_seen": 3802016, "step": 3790 }, { "epoch": 1.7892503536067892, "grad_norm": 1.0340715646743774, "learning_rate": 4.471947194719473e-05, "loss": 0.0958, "num_input_tokens_seen": 3806848, "step": 3795 }, { "epoch": 1.7916077322017916, "grad_norm": 1.1720647811889648, "learning_rate": 4.477840641206978e-05, "loss": 0.1774, "num_input_tokens_seen": 3812160, "step": 3800 }, { "epoch": 1.793965110796794, "grad_norm": 0.8077684640884399, "learning_rate": 4.483734087694484e-05, "loss": 0.1648, "num_input_tokens_seen": 3817184, "step": 3805 }, { "epoch": 1.7963224893917964, "grad_norm": 3.253488302230835, "learning_rate": 4.48962753418199e-05, "loss": 0.1884, "num_input_tokens_seen": 3822848, "step": 3810 }, { "epoch": 1.7986798679867988, "grad_norm": 0.08249658346176147, "learning_rate": 4.495520980669496e-05, "loss": 0.2533, "num_input_tokens_seen": 3827296, "step": 3815 }, { "epoch": 1.8010372465818012, "grad_norm": 1.0758930444717407, "learning_rate": 4.501414427157002e-05, "loss": 0.1609, "num_input_tokens_seen": 3831616, "step": 3820 }, { "epoch": 1.8033946251768034, "grad_norm": 1.205527901649475, "learning_rate": 4.507307873644507e-05, "loss": 0.1471, "num_input_tokens_seen": 3836480, "step": 3825 }, { "epoch": 1.8057520037718058, "grad_norm": 1.1889359951019287, "learning_rate": 4.513201320132013e-05, "loss": 0.1063, "num_input_tokens_seen": 3841920, "step": 3830 }, { "epoch": 1.808109382366808, "grad_norm": 0.04660703241825104, "learning_rate": 4.519094766619519e-05, "loss": 0.3338, "num_input_tokens_seen": 3847968, "step": 3835 }, { "epoch": 1.8104667609618104, "grad_norm": 0.2915787100791931, "learning_rate": 4.524988213107026e-05, "loss": 0.1257, "num_input_tokens_seen": 3852736, "step": 3840 }, { "epoch": 1.8128241395568128, "grad_norm": 1.9051227569580078, "learning_rate": 4.530881659594532e-05, "loss": 0.2372, "num_input_tokens_seen": 3857824, "step": 3845 }, { "epoch": 1.8151815181518152, "grad_norm": 0.6314691305160522, "learning_rate": 4.536775106082037e-05, "loss": 0.142, "num_input_tokens_seen": 3864256, "step": 3850 }, { "epoch": 1.8175388967468176, "grad_norm": 0.6404446363449097, "learning_rate": 4.542668552569543e-05, "loss": 0.1228, "num_input_tokens_seen": 3869920, "step": 3855 }, { "epoch": 1.81989627534182, "grad_norm": 1.1334192752838135, "learning_rate": 4.548561999057049e-05, "loss": 0.1373, "num_input_tokens_seen": 3876128, "step": 3860 }, { "epoch": 1.8222536539368224, "grad_norm": 0.321017324924469, "learning_rate": 4.554455445544555e-05, "loss": 0.1774, "num_input_tokens_seen": 3880896, "step": 3865 }, { "epoch": 1.8246110325318248, "grad_norm": 0.590065062046051, "learning_rate": 4.560348892032061e-05, "loss": 0.2223, "num_input_tokens_seen": 3885504, "step": 3870 }, { "epoch": 1.826968411126827, "grad_norm": 1.2204855680465698, "learning_rate": 4.566242338519566e-05, "loss": 0.1457, "num_input_tokens_seen": 3890048, "step": 3875 }, { "epoch": 1.8293257897218294, "grad_norm": 0.11513256281614304, "learning_rate": 4.572135785007072e-05, "loss": 0.053, "num_input_tokens_seen": 3895296, "step": 3880 }, { "epoch": 1.8316831683168315, "grad_norm": 0.49988076090812683, "learning_rate": 4.578029231494578e-05, "loss": 0.1475, "num_input_tokens_seen": 3899808, "step": 3885 }, { "epoch": 1.834040546911834, "grad_norm": 1.0625454187393188, "learning_rate": 4.583922677982084e-05, "loss": 0.2721, "num_input_tokens_seen": 3905760, "step": 3890 }, { "epoch": 1.8363979255068363, "grad_norm": 1.9107604026794434, "learning_rate": 4.5898161244695906e-05, "loss": 0.3463, "num_input_tokens_seen": 3912160, "step": 3895 }, { "epoch": 1.8387553041018387, "grad_norm": 0.9513694643974304, "learning_rate": 4.595709570957096e-05, "loss": 0.1597, "num_input_tokens_seen": 3917120, "step": 3900 }, { "epoch": 1.8411126826968411, "grad_norm": 0.6985296607017517, "learning_rate": 4.601603017444602e-05, "loss": 0.072, "num_input_tokens_seen": 3922752, "step": 3905 }, { "epoch": 1.8434700612918435, "grad_norm": 0.5059312582015991, "learning_rate": 4.607496463932108e-05, "loss": 0.2213, "num_input_tokens_seen": 3927840, "step": 3910 }, { "epoch": 1.845827439886846, "grad_norm": 2.4979069232940674, "learning_rate": 4.613389910419614e-05, "loss": 0.3436, "num_input_tokens_seen": 3932864, "step": 3915 }, { "epoch": 1.8481848184818483, "grad_norm": 3.067866802215576, "learning_rate": 4.61928335690712e-05, "loss": 0.327, "num_input_tokens_seen": 3938016, "step": 3920 }, { "epoch": 1.8505421970768505, "grad_norm": 0.21624217927455902, "learning_rate": 4.625176803394625e-05, "loss": 0.2611, "num_input_tokens_seen": 3944960, "step": 3925 }, { "epoch": 1.852899575671853, "grad_norm": 0.549531877040863, "learning_rate": 4.631070249882131e-05, "loss": 0.1393, "num_input_tokens_seen": 3949760, "step": 3930 }, { "epoch": 1.855256954266855, "grad_norm": 0.33397921919822693, "learning_rate": 4.636963696369637e-05, "loss": 0.0452, "num_input_tokens_seen": 3954656, "step": 3935 }, { "epoch": 1.8576143328618575, "grad_norm": 0.6960455775260925, "learning_rate": 4.642857142857143e-05, "loss": 0.1239, "num_input_tokens_seen": 3959136, "step": 3940 }, { "epoch": 1.85997171145686, "grad_norm": 0.3262898027896881, "learning_rate": 4.6487505893446496e-05, "loss": 0.1074, "num_input_tokens_seen": 3963776, "step": 3945 }, { "epoch": 1.8623290900518623, "grad_norm": 0.6710801720619202, "learning_rate": 4.654644035832155e-05, "loss": 0.3, "num_input_tokens_seen": 3968896, "step": 3950 }, { "epoch": 1.8646864686468647, "grad_norm": 0.22493486106395721, "learning_rate": 4.660537482319661e-05, "loss": 0.0706, "num_input_tokens_seen": 3973824, "step": 3955 }, { "epoch": 1.8670438472418671, "grad_norm": 0.10904567688703537, "learning_rate": 4.666430928807167e-05, "loss": 0.0748, "num_input_tokens_seen": 3979776, "step": 3960 }, { "epoch": 1.8694012258368695, "grad_norm": 2.624328136444092, "learning_rate": 4.672324375294673e-05, "loss": 0.2252, "num_input_tokens_seen": 3985344, "step": 3965 }, { "epoch": 1.871758604431872, "grad_norm": 0.1784209907054901, "learning_rate": 4.678217821782179e-05, "loss": 0.1499, "num_input_tokens_seen": 3990144, "step": 3970 }, { "epoch": 1.874115983026874, "grad_norm": 0.2564029097557068, "learning_rate": 4.684111268269684e-05, "loss": 0.2843, "num_input_tokens_seen": 3994464, "step": 3975 }, { "epoch": 1.8764733616218765, "grad_norm": 0.8863968849182129, "learning_rate": 4.69000471475719e-05, "loss": 0.0721, "num_input_tokens_seen": 3999808, "step": 3980 }, { "epoch": 1.8788307402168787, "grad_norm": 0.42346376180648804, "learning_rate": 4.695898161244696e-05, "loss": 0.1247, "num_input_tokens_seen": 4004032, "step": 3985 }, { "epoch": 1.881188118811881, "grad_norm": 0.8516708016395569, "learning_rate": 4.701791607732202e-05, "loss": 0.169, "num_input_tokens_seen": 4008960, "step": 3990 }, { "epoch": 1.8835454974068835, "grad_norm": 0.22653353214263916, "learning_rate": 4.707685054219708e-05, "loss": 0.1067, "num_input_tokens_seen": 4013120, "step": 3995 }, { "epoch": 1.8859028760018859, "grad_norm": 2.1807477474212646, "learning_rate": 4.713578500707214e-05, "loss": 0.2095, "num_input_tokens_seen": 4017696, "step": 4000 }, { "epoch": 1.8882602545968883, "grad_norm": 0.5872830748558044, "learning_rate": 4.71947194719472e-05, "loss": 0.2541, "num_input_tokens_seen": 4022464, "step": 4005 }, { "epoch": 1.8906176331918907, "grad_norm": 2.4725522994995117, "learning_rate": 4.725365393682226e-05, "loss": 0.2387, "num_input_tokens_seen": 4026912, "step": 4010 }, { "epoch": 1.892975011786893, "grad_norm": 1.6731507778167725, "learning_rate": 4.731258840169732e-05, "loss": 0.4184, "num_input_tokens_seen": 4032000, "step": 4015 }, { "epoch": 1.8953323903818955, "grad_norm": 4.202366828918457, "learning_rate": 4.737152286657238e-05, "loss": 0.4128, "num_input_tokens_seen": 4036608, "step": 4020 }, { "epoch": 1.8976897689768977, "grad_norm": 0.5259521007537842, "learning_rate": 4.743045733144743e-05, "loss": 0.1142, "num_input_tokens_seen": 4041280, "step": 4025 }, { "epoch": 1.9000471475719, "grad_norm": 1.6976982355117798, "learning_rate": 4.748939179632249e-05, "loss": 0.1576, "num_input_tokens_seen": 4047520, "step": 4030 }, { "epoch": 1.9024045261669023, "grad_norm": 0.06669851392507553, "learning_rate": 4.754832626119755e-05, "loss": 0.1118, "num_input_tokens_seen": 4053696, "step": 4035 }, { "epoch": 1.9047619047619047, "grad_norm": 0.7570860981941223, "learning_rate": 4.760726072607261e-05, "loss": 0.1884, "num_input_tokens_seen": 4057888, "step": 4040 }, { "epoch": 1.907119283356907, "grad_norm": 3.2767562866210938, "learning_rate": 4.766619519094767e-05, "loss": 0.2492, "num_input_tokens_seen": 4063520, "step": 4045 }, { "epoch": 1.9094766619519095, "grad_norm": 0.7543337345123291, "learning_rate": 4.772512965582273e-05, "loss": 0.0893, "num_input_tokens_seen": 4068384, "step": 4050 }, { "epoch": 1.9118340405469119, "grad_norm": 0.5019770264625549, "learning_rate": 4.778406412069779e-05, "loss": 0.0781, "num_input_tokens_seen": 4072768, "step": 4055 }, { "epoch": 1.9141914191419143, "grad_norm": 1.2894998788833618, "learning_rate": 4.784299858557285e-05, "loss": 0.1648, "num_input_tokens_seen": 4078400, "step": 4060 }, { "epoch": 1.9165487977369167, "grad_norm": 0.8702859878540039, "learning_rate": 4.7901933050447907e-05, "loss": 0.1965, "num_input_tokens_seen": 4083168, "step": 4065 }, { "epoch": 1.918906176331919, "grad_norm": 0.6565982103347778, "learning_rate": 4.7960867515322966e-05, "loss": 0.22, "num_input_tokens_seen": 4087904, "step": 4070 }, { "epoch": 1.9212635549269212, "grad_norm": 0.304512619972229, "learning_rate": 4.801980198019802e-05, "loss": 0.2343, "num_input_tokens_seen": 4092704, "step": 4075 }, { "epoch": 1.9236209335219236, "grad_norm": 1.6304144859313965, "learning_rate": 4.807873644507308e-05, "loss": 0.2084, "num_input_tokens_seen": 4097984, "step": 4080 }, { "epoch": 1.9259783121169258, "grad_norm": 0.896079957485199, "learning_rate": 4.813767090994814e-05, "loss": 0.1266, "num_input_tokens_seen": 4102976, "step": 4085 }, { "epoch": 1.9283356907119282, "grad_norm": 2.191800355911255, "learning_rate": 4.81966053748232e-05, "loss": 0.2855, "num_input_tokens_seen": 4108896, "step": 4090 }, { "epoch": 1.9306930693069306, "grad_norm": 2.717115640640259, "learning_rate": 4.825553983969826e-05, "loss": 0.3036, "num_input_tokens_seen": 4114336, "step": 4095 }, { "epoch": 1.933050447901933, "grad_norm": 1.461082100868225, "learning_rate": 4.831447430457332e-05, "loss": 0.2837, "num_input_tokens_seen": 4118944, "step": 4100 }, { "epoch": 1.9354078264969354, "grad_norm": 0.21483205258846283, "learning_rate": 4.837340876944838e-05, "loss": 0.335, "num_input_tokens_seen": 4123968, "step": 4105 }, { "epoch": 1.9377652050919378, "grad_norm": 1.1976476907730103, "learning_rate": 4.8432343234323437e-05, "loss": 0.3248, "num_input_tokens_seen": 4129120, "step": 4110 }, { "epoch": 1.9401225836869402, "grad_norm": 5.7917680740356445, "learning_rate": 4.8491277699198496e-05, "loss": 0.434, "num_input_tokens_seen": 4133536, "step": 4115 }, { "epoch": 1.9424799622819426, "grad_norm": 2.2298519611358643, "learning_rate": 4.8550212164073556e-05, "loss": 0.1755, "num_input_tokens_seen": 4138208, "step": 4120 }, { "epoch": 1.9448373408769448, "grad_norm": 0.2364039570093155, "learning_rate": 4.860914662894861e-05, "loss": 0.1332, "num_input_tokens_seen": 4142656, "step": 4125 }, { "epoch": 1.9471947194719472, "grad_norm": 0.9321277737617493, "learning_rate": 4.866808109382367e-05, "loss": 0.0874, "num_input_tokens_seen": 4148224, "step": 4130 }, { "epoch": 1.9495520980669494, "grad_norm": 0.28196609020233154, "learning_rate": 4.872701555869873e-05, "loss": 0.0573, "num_input_tokens_seen": 4152800, "step": 4135 }, { "epoch": 1.9519094766619518, "grad_norm": 0.9414814114570618, "learning_rate": 4.878595002357379e-05, "loss": 0.0535, "num_input_tokens_seen": 4160608, "step": 4140 }, { "epoch": 1.9542668552569542, "grad_norm": 1.5075552463531494, "learning_rate": 4.884488448844885e-05, "loss": 0.1817, "num_input_tokens_seen": 4165056, "step": 4145 }, { "epoch": 1.9566242338519566, "grad_norm": 0.4410640001296997, "learning_rate": 4.890381895332391e-05, "loss": 0.1462, "num_input_tokens_seen": 4170048, "step": 4150 }, { "epoch": 1.958981612446959, "grad_norm": 1.0569517612457275, "learning_rate": 4.8962753418198966e-05, "loss": 0.1509, "num_input_tokens_seen": 4175520, "step": 4155 }, { "epoch": 1.9613389910419614, "grad_norm": 0.7360678315162659, "learning_rate": 4.9021687883074026e-05, "loss": 0.4019, "num_input_tokens_seen": 4182176, "step": 4160 }, { "epoch": 1.9636963696369638, "grad_norm": 0.9358797669410706, "learning_rate": 4.9080622347949086e-05, "loss": 0.1826, "num_input_tokens_seen": 4187552, "step": 4165 }, { "epoch": 1.9660537482319662, "grad_norm": 2.088259696960449, "learning_rate": 4.9139556812824145e-05, "loss": 0.2342, "num_input_tokens_seen": 4193056, "step": 4170 }, { "epoch": 1.9684111268269684, "grad_norm": 1.4292768239974976, "learning_rate": 4.91984912776992e-05, "loss": 0.2252, "num_input_tokens_seen": 4197792, "step": 4175 }, { "epoch": 1.9707685054219708, "grad_norm": 0.42843595147132874, "learning_rate": 4.925742574257426e-05, "loss": 0.1708, "num_input_tokens_seen": 4201984, "step": 4180 }, { "epoch": 1.973125884016973, "grad_norm": 2.252049207687378, "learning_rate": 4.931636020744932e-05, "loss": 0.1578, "num_input_tokens_seen": 4208000, "step": 4185 }, { "epoch": 1.9754832626119754, "grad_norm": 0.8642746210098267, "learning_rate": 4.937529467232438e-05, "loss": 0.1405, "num_input_tokens_seen": 4213056, "step": 4190 }, { "epoch": 1.9778406412069778, "grad_norm": 3.5519566535949707, "learning_rate": 4.943422913719944e-05, "loss": 0.3588, "num_input_tokens_seen": 4217952, "step": 4195 }, { "epoch": 1.9801980198019802, "grad_norm": 0.6495652198791504, "learning_rate": 4.9493163602074496e-05, "loss": 0.0854, "num_input_tokens_seen": 4222336, "step": 4200 }, { "epoch": 1.9825553983969826, "grad_norm": 0.8896437883377075, "learning_rate": 4.9552098066949556e-05, "loss": 0.1082, "num_input_tokens_seen": 4227840, "step": 4205 }, { "epoch": 1.984912776991985, "grad_norm": 1.8411375284194946, "learning_rate": 4.9611032531824616e-05, "loss": 0.1998, "num_input_tokens_seen": 4233280, "step": 4210 }, { "epoch": 1.9872701555869874, "grad_norm": 0.3128986358642578, "learning_rate": 4.9669966996699675e-05, "loss": 0.1522, "num_input_tokens_seen": 4237280, "step": 4215 }, { "epoch": 1.9896275341819898, "grad_norm": 0.13538140058517456, "learning_rate": 4.9728901461574735e-05, "loss": 0.1116, "num_input_tokens_seen": 4242016, "step": 4220 }, { "epoch": 1.991984912776992, "grad_norm": 1.5477361679077148, "learning_rate": 4.978783592644979e-05, "loss": 0.2507, "num_input_tokens_seen": 4247008, "step": 4225 }, { "epoch": 1.9943422913719944, "grad_norm": 0.6279570460319519, "learning_rate": 4.984677039132485e-05, "loss": 0.2106, "num_input_tokens_seen": 4252768, "step": 4230 }, { "epoch": 1.9966996699669965, "grad_norm": 0.422994464635849, "learning_rate": 4.990570485619991e-05, "loss": 0.2623, "num_input_tokens_seen": 4257568, "step": 4235 }, { "epoch": 1.999057048561999, "grad_norm": 0.22970350086688995, "learning_rate": 4.996463932107497e-05, "loss": 0.1636, "num_input_tokens_seen": 4262496, "step": 4240 }, { "epoch": 2.0, "eval_loss": 0.18287692964076996, "eval_runtime": 15.1476, "eval_samples_per_second": 62.254, "eval_steps_per_second": 15.58, "num_input_tokens_seen": 4264768, "step": 4242 }, { "epoch": 2.0014144271570014, "grad_norm": 0.63239586353302, "learning_rate": 4.999999966143396e-05, "loss": 0.1177, "num_input_tokens_seen": 4267584, "step": 4245 }, { "epoch": 2.0037718057520038, "grad_norm": 1.1613831520080566, "learning_rate": 4.9999995852566046e-05, "loss": 0.1442, "num_input_tokens_seen": 4272640, "step": 4250 }, { "epoch": 2.006129184347006, "grad_norm": 2.164857864379883, "learning_rate": 4.999998781162331e-05, "loss": 0.1178, "num_input_tokens_seen": 4278080, "step": 4255 }, { "epoch": 2.0084865629420086, "grad_norm": 1.3294209241867065, "learning_rate": 4.999997553860712e-05, "loss": 0.1881, "num_input_tokens_seen": 4282496, "step": 4260 }, { "epoch": 2.010843941537011, "grad_norm": 0.5552541613578796, "learning_rate": 4.999995903351954e-05, "loss": 0.2509, "num_input_tokens_seen": 4287456, "step": 4265 }, { "epoch": 2.0132013201320134, "grad_norm": 0.4306597113609314, "learning_rate": 4.999993829636337e-05, "loss": 0.0696, "num_input_tokens_seen": 4292576, "step": 4270 }, { "epoch": 2.0155586987270158, "grad_norm": 1.236284613609314, "learning_rate": 4.999991332714212e-05, "loss": 0.1788, "num_input_tokens_seen": 4297600, "step": 4275 }, { "epoch": 2.0179160773220177, "grad_norm": 0.5416891574859619, "learning_rate": 4.999988412586003e-05, "loss": 0.0787, "num_input_tokens_seen": 4303744, "step": 4280 }, { "epoch": 2.02027345591702, "grad_norm": 0.901317298412323, "learning_rate": 4.999985069252202e-05, "loss": 0.1922, "num_input_tokens_seen": 4309312, "step": 4285 }, { "epoch": 2.0226308345120225, "grad_norm": 0.703439474105835, "learning_rate": 4.999981302713377e-05, "loss": 0.1598, "num_input_tokens_seen": 4314496, "step": 4290 }, { "epoch": 2.024988213107025, "grad_norm": 0.9123945236206055, "learning_rate": 4.999977112970164e-05, "loss": 0.1732, "num_input_tokens_seen": 4319552, "step": 4295 }, { "epoch": 2.0273455917020273, "grad_norm": 3.3112032413482666, "learning_rate": 4.9999725000232734e-05, "loss": 0.2166, "num_input_tokens_seen": 4324448, "step": 4300 }, { "epoch": 2.0297029702970297, "grad_norm": 0.31541693210601807, "learning_rate": 4.999967463873485e-05, "loss": 0.2296, "num_input_tokens_seen": 4329760, "step": 4305 }, { "epoch": 2.032060348892032, "grad_norm": 0.321872353553772, "learning_rate": 4.999962004521652e-05, "loss": 0.1161, "num_input_tokens_seen": 4334880, "step": 4310 }, { "epoch": 2.0344177274870345, "grad_norm": 1.5491257905960083, "learning_rate": 4.9999561219686995e-05, "loss": 0.1988, "num_input_tokens_seen": 4340096, "step": 4315 }, { "epoch": 2.036775106082037, "grad_norm": 0.17604507505893707, "learning_rate": 4.999949816215622e-05, "loss": 0.0612, "num_input_tokens_seen": 4345344, "step": 4320 }, { "epoch": 2.0391324846770393, "grad_norm": 1.1408127546310425, "learning_rate": 4.999943087263487e-05, "loss": 0.406, "num_input_tokens_seen": 4351360, "step": 4325 }, { "epoch": 2.0414898632720413, "grad_norm": 0.4904976189136505, "learning_rate": 4.9999359351134346e-05, "loss": 0.1697, "num_input_tokens_seen": 4356192, "step": 4330 }, { "epoch": 2.0438472418670437, "grad_norm": 0.1986754983663559, "learning_rate": 4.999928359766674e-05, "loss": 0.1303, "num_input_tokens_seen": 4362496, "step": 4335 }, { "epoch": 2.046204620462046, "grad_norm": 2.4270427227020264, "learning_rate": 4.9999203612244896e-05, "loss": 0.173, "num_input_tokens_seen": 4366752, "step": 4340 }, { "epoch": 2.0485619990570485, "grad_norm": 0.6227543354034424, "learning_rate": 4.9999119394882334e-05, "loss": 0.0573, "num_input_tokens_seen": 4371648, "step": 4345 }, { "epoch": 2.050919377652051, "grad_norm": 0.44962525367736816, "learning_rate": 4.999903094559332e-05, "loss": 0.2652, "num_input_tokens_seen": 4376864, "step": 4350 }, { "epoch": 2.0532767562470533, "grad_norm": 1.412336826324463, "learning_rate": 4.999893826439283e-05, "loss": 0.1949, "num_input_tokens_seen": 4381408, "step": 4355 }, { "epoch": 2.0556341348420557, "grad_norm": 2.1435821056365967, "learning_rate": 4.9998841351296546e-05, "loss": 0.2849, "num_input_tokens_seen": 4385888, "step": 4360 }, { "epoch": 2.057991513437058, "grad_norm": 0.05592789500951767, "learning_rate": 4.999874020632087e-05, "loss": 0.1308, "num_input_tokens_seen": 4390432, "step": 4365 }, { "epoch": 2.0603488920320605, "grad_norm": 0.06917336583137512, "learning_rate": 4.9998634829482945e-05, "loss": 0.2624, "num_input_tokens_seen": 4395296, "step": 4370 }, { "epoch": 2.062706270627063, "grad_norm": 2.597736358642578, "learning_rate": 4.999852522080059e-05, "loss": 0.4059, "num_input_tokens_seen": 4400192, "step": 4375 }, { "epoch": 2.065063649222065, "grad_norm": 0.38748106360435486, "learning_rate": 4.9998411380292364e-05, "loss": 0.1495, "num_input_tokens_seen": 4405632, "step": 4380 }, { "epoch": 2.0674210278170673, "grad_norm": 0.9783794283866882, "learning_rate": 4.999829330797755e-05, "loss": 0.0704, "num_input_tokens_seen": 4410656, "step": 4385 }, { "epoch": 2.0697784064120697, "grad_norm": 1.5590810775756836, "learning_rate": 4.999817100387612e-05, "loss": 0.1198, "num_input_tokens_seen": 4415680, "step": 4390 }, { "epoch": 2.072135785007072, "grad_norm": 0.3546421229839325, "learning_rate": 4.9998044468008775e-05, "loss": 0.1239, "num_input_tokens_seen": 4421024, "step": 4395 }, { "epoch": 2.0744931636020745, "grad_norm": 0.5983394384384155, "learning_rate": 4.999791370039696e-05, "loss": 0.2587, "num_input_tokens_seen": 4425376, "step": 4400 }, { "epoch": 2.076850542197077, "grad_norm": 0.7397473454475403, "learning_rate": 4.999777870106278e-05, "loss": 0.1921, "num_input_tokens_seen": 4430304, "step": 4405 }, { "epoch": 2.0792079207920793, "grad_norm": 0.8352524042129517, "learning_rate": 4.999763947002911e-05, "loss": 0.1604, "num_input_tokens_seen": 4435520, "step": 4410 }, { "epoch": 2.0815652993870817, "grad_norm": 1.266044020652771, "learning_rate": 4.999749600731952e-05, "loss": 0.2568, "num_input_tokens_seen": 4440256, "step": 4415 }, { "epoch": 2.083922677982084, "grad_norm": 0.5033084154129028, "learning_rate": 4.9997348312958285e-05, "loss": 0.1545, "num_input_tokens_seen": 4445152, "step": 4420 }, { "epoch": 2.0862800565770865, "grad_norm": 1.0447155237197876, "learning_rate": 4.9997196386970416e-05, "loss": 0.2303, "num_input_tokens_seen": 4449632, "step": 4425 }, { "epoch": 2.0886374351720884, "grad_norm": 0.2818651497364044, "learning_rate": 4.9997040229381616e-05, "loss": 0.11, "num_input_tokens_seen": 4455136, "step": 4430 }, { "epoch": 2.090994813767091, "grad_norm": 0.15917536616325378, "learning_rate": 4.9996879840218335e-05, "loss": 0.0555, "num_input_tokens_seen": 4459808, "step": 4435 }, { "epoch": 2.0933521923620932, "grad_norm": 0.7192989587783813, "learning_rate": 4.999671521950773e-05, "loss": 0.2321, "num_input_tokens_seen": 4466080, "step": 4440 }, { "epoch": 2.0957095709570956, "grad_norm": 0.569118857383728, "learning_rate": 4.999654636727764e-05, "loss": 0.2017, "num_input_tokens_seen": 4473184, "step": 4445 }, { "epoch": 2.098066949552098, "grad_norm": 0.26554203033447266, "learning_rate": 4.999637328355669e-05, "loss": 0.1929, "num_input_tokens_seen": 4477824, "step": 4450 }, { "epoch": 2.1004243281471005, "grad_norm": 1.1966660022735596, "learning_rate": 4.999619596837414e-05, "loss": 0.1705, "num_input_tokens_seen": 4482624, "step": 4455 }, { "epoch": 2.102781706742103, "grad_norm": 0.09204025566577911, "learning_rate": 4.999601442176003e-05, "loss": 0.2294, "num_input_tokens_seen": 4487488, "step": 4460 }, { "epoch": 2.1051390853371053, "grad_norm": 2.038649797439575, "learning_rate": 4.999582864374508e-05, "loss": 0.1349, "num_input_tokens_seen": 4492768, "step": 4465 }, { "epoch": 2.1074964639321077, "grad_norm": 1.8310467004776, "learning_rate": 4.999563863436075e-05, "loss": 0.4007, "num_input_tokens_seen": 4498208, "step": 4470 }, { "epoch": 2.10985384252711, "grad_norm": 0.46989187598228455, "learning_rate": 4.99954443936392e-05, "loss": 0.2358, "num_input_tokens_seen": 4502880, "step": 4475 }, { "epoch": 2.112211221122112, "grad_norm": 1.3749052286148071, "learning_rate": 4.999524592161332e-05, "loss": 0.1194, "num_input_tokens_seen": 4508032, "step": 4480 }, { "epoch": 2.1145685997171144, "grad_norm": 0.8055626153945923, "learning_rate": 4.9995043218316694e-05, "loss": 0.185, "num_input_tokens_seen": 4513568, "step": 4485 }, { "epoch": 2.116925978312117, "grad_norm": 0.5235068202018738, "learning_rate": 4.999483628378364e-05, "loss": 0.221, "num_input_tokens_seen": 4518976, "step": 4490 }, { "epoch": 2.119283356907119, "grad_norm": 2.003682851791382, "learning_rate": 4.99946251180492e-05, "loss": 0.143, "num_input_tokens_seen": 4522912, "step": 4495 }, { "epoch": 2.1216407355021216, "grad_norm": 1.6895294189453125, "learning_rate": 4.999440972114911e-05, "loss": 0.1215, "num_input_tokens_seen": 4527520, "step": 4500 }, { "epoch": 2.123998114097124, "grad_norm": 0.5688002705574036, "learning_rate": 4.999419009311983e-05, "loss": 0.0553, "num_input_tokens_seen": 4532416, "step": 4505 }, { "epoch": 2.1263554926921264, "grad_norm": 0.9523225426673889, "learning_rate": 4.999396623399855e-05, "loss": 0.2425, "num_input_tokens_seen": 4537952, "step": 4510 }, { "epoch": 2.128712871287129, "grad_norm": 0.47958648204803467, "learning_rate": 4.999373814382315e-05, "loss": 0.0592, "num_input_tokens_seen": 4542528, "step": 4515 }, { "epoch": 2.1310702498821312, "grad_norm": 0.1794375628232956, "learning_rate": 4.999350582263226e-05, "loss": 0.1042, "num_input_tokens_seen": 4547008, "step": 4520 }, { "epoch": 2.1334276284771336, "grad_norm": 0.2842442989349365, "learning_rate": 4.99932692704652e-05, "loss": 0.2061, "num_input_tokens_seen": 4551680, "step": 4525 }, { "epoch": 2.1357850070721356, "grad_norm": 1.563781499862671, "learning_rate": 4.9993028487362006e-05, "loss": 0.2424, "num_input_tokens_seen": 4557216, "step": 4530 }, { "epoch": 2.138142385667138, "grad_norm": 1.08412504196167, "learning_rate": 4.9992783473363455e-05, "loss": 0.1709, "num_input_tokens_seen": 4562464, "step": 4535 }, { "epoch": 2.1404997642621404, "grad_norm": 1.059828281402588, "learning_rate": 4.999253422851101e-05, "loss": 0.1855, "num_input_tokens_seen": 4567776, "step": 4540 }, { "epoch": 2.142857142857143, "grad_norm": 1.6149119138717651, "learning_rate": 4.999228075284688e-05, "loss": 0.2982, "num_input_tokens_seen": 4573152, "step": 4545 }, { "epoch": 2.145214521452145, "grad_norm": 3.0906624794006348, "learning_rate": 4.999202304641395e-05, "loss": 0.3138, "num_input_tokens_seen": 4578048, "step": 4550 }, { "epoch": 2.1475719000471476, "grad_norm": 0.1806046962738037, "learning_rate": 4.9991761109255864e-05, "loss": 0.1863, "num_input_tokens_seen": 4582336, "step": 4555 }, { "epoch": 2.14992927864215, "grad_norm": 0.7850853204727173, "learning_rate": 4.999149494141696e-05, "loss": 0.0781, "num_input_tokens_seen": 4587360, "step": 4560 }, { "epoch": 2.1522866572371524, "grad_norm": 1.9433988332748413, "learning_rate": 4.9991224542942294e-05, "loss": 0.3029, "num_input_tokens_seen": 4591872, "step": 4565 }, { "epoch": 2.154644035832155, "grad_norm": 0.8003458380699158, "learning_rate": 4.999094991387764e-05, "loss": 0.1447, "num_input_tokens_seen": 4596640, "step": 4570 }, { "epoch": 2.157001414427157, "grad_norm": 2.000875234603882, "learning_rate": 4.9990671054269485e-05, "loss": 0.1378, "num_input_tokens_seen": 4600672, "step": 4575 }, { "epoch": 2.159358793022159, "grad_norm": 1.4425963163375854, "learning_rate": 4.999038796416503e-05, "loss": 0.1404, "num_input_tokens_seen": 4606080, "step": 4580 }, { "epoch": 2.1617161716171616, "grad_norm": 0.6689937710762024, "learning_rate": 4.9990100643612226e-05, "loss": 0.2588, "num_input_tokens_seen": 4610816, "step": 4585 }, { "epoch": 2.164073550212164, "grad_norm": 0.20753052830696106, "learning_rate": 4.998980909265967e-05, "loss": 0.1606, "num_input_tokens_seen": 4615968, "step": 4590 }, { "epoch": 2.1664309288071664, "grad_norm": 1.3502644300460815, "learning_rate": 4.998951331135675e-05, "loss": 0.2086, "num_input_tokens_seen": 4620288, "step": 4595 }, { "epoch": 2.1687883074021688, "grad_norm": 0.7847273945808411, "learning_rate": 4.998921329975352e-05, "loss": 0.2687, "num_input_tokens_seen": 4625568, "step": 4600 }, { "epoch": 2.171145685997171, "grad_norm": 0.574410080909729, "learning_rate": 4.9988909057900777e-05, "loss": 0.3419, "num_input_tokens_seen": 4630560, "step": 4605 }, { "epoch": 2.1735030645921736, "grad_norm": 0.7888584733009338, "learning_rate": 4.998860058585001e-05, "loss": 0.0989, "num_input_tokens_seen": 4636192, "step": 4610 }, { "epoch": 2.175860443187176, "grad_norm": 0.5788165330886841, "learning_rate": 4.998828788365344e-05, "loss": 0.1, "num_input_tokens_seen": 4641344, "step": 4615 }, { "epoch": 2.1782178217821784, "grad_norm": 0.8480572700500488, "learning_rate": 4.998797095136403e-05, "loss": 0.1575, "num_input_tokens_seen": 4646528, "step": 4620 }, { "epoch": 2.1805752003771808, "grad_norm": 0.12160918116569519, "learning_rate": 4.998764978903539e-05, "loss": 0.0995, "num_input_tokens_seen": 4651616, "step": 4625 }, { "epoch": 2.1829325789721827, "grad_norm": 0.6347771286964417, "learning_rate": 4.9987324396721915e-05, "loss": 0.0902, "num_input_tokens_seen": 4656672, "step": 4630 }, { "epoch": 2.185289957567185, "grad_norm": 0.08421700447797775, "learning_rate": 4.998699477447868e-05, "loss": 0.2266, "num_input_tokens_seen": 4662144, "step": 4635 }, { "epoch": 2.1876473361621875, "grad_norm": 1.154948353767395, "learning_rate": 4.998666092236148e-05, "loss": 0.3005, "num_input_tokens_seen": 4667488, "step": 4640 }, { "epoch": 2.19000471475719, "grad_norm": 2.191789150238037, "learning_rate": 4.9986322840426835e-05, "loss": 0.107, "num_input_tokens_seen": 4672480, "step": 4645 }, { "epoch": 2.1923620933521923, "grad_norm": 1.0261383056640625, "learning_rate": 4.9985980528731976e-05, "loss": 0.2422, "num_input_tokens_seen": 4677024, "step": 4650 }, { "epoch": 2.1947194719471947, "grad_norm": 0.48352017998695374, "learning_rate": 4.998563398733486e-05, "loss": 0.318, "num_input_tokens_seen": 4681728, "step": 4655 }, { "epoch": 2.197076850542197, "grad_norm": 1.4572006464004517, "learning_rate": 4.998528321629414e-05, "loss": 0.1489, "num_input_tokens_seen": 4686016, "step": 4660 }, { "epoch": 2.1994342291371995, "grad_norm": 0.2938033938407898, "learning_rate": 4.998492821566919e-05, "loss": 0.0489, "num_input_tokens_seen": 4690240, "step": 4665 }, { "epoch": 2.201791607732202, "grad_norm": 1.0687174797058105, "learning_rate": 4.998456898552012e-05, "loss": 0.1072, "num_input_tokens_seen": 4694848, "step": 4670 }, { "epoch": 2.2041489863272044, "grad_norm": 1.4882248640060425, "learning_rate": 4.9984205525907736e-05, "loss": 0.1934, "num_input_tokens_seen": 4699136, "step": 4675 }, { "epoch": 2.2065063649222063, "grad_norm": 0.15786118805408478, "learning_rate": 4.998383783689356e-05, "loss": 0.1687, "num_input_tokens_seen": 4704384, "step": 4680 }, { "epoch": 2.2088637435172087, "grad_norm": 0.8197078108787537, "learning_rate": 4.998346591853984e-05, "loss": 0.2021, "num_input_tokens_seen": 4709632, "step": 4685 }, { "epoch": 2.211221122112211, "grad_norm": 0.6734718084335327, "learning_rate": 4.998308977090953e-05, "loss": 0.1126, "num_input_tokens_seen": 4714560, "step": 4690 }, { "epoch": 2.2135785007072135, "grad_norm": 0.47998833656311035, "learning_rate": 4.998270939406632e-05, "loss": 0.1309, "num_input_tokens_seen": 4719008, "step": 4695 }, { "epoch": 2.215935879302216, "grad_norm": 0.5677729249000549, "learning_rate": 4.9982324788074585e-05, "loss": 0.1533, "num_input_tokens_seen": 4723136, "step": 4700 }, { "epoch": 2.2182932578972183, "grad_norm": 0.5404605269432068, "learning_rate": 4.9981935952999445e-05, "loss": 0.0857, "num_input_tokens_seen": 4729024, "step": 4705 }, { "epoch": 2.2206506364922207, "grad_norm": 1.3808344602584839, "learning_rate": 4.998154288890671e-05, "loss": 0.1657, "num_input_tokens_seen": 4733152, "step": 4710 }, { "epoch": 2.223008015087223, "grad_norm": 0.5632413625717163, "learning_rate": 4.998114559586293e-05, "loss": 0.2276, "num_input_tokens_seen": 4737824, "step": 4715 }, { "epoch": 2.2253653936822255, "grad_norm": 0.0905737578868866, "learning_rate": 4.998074407393536e-05, "loss": 0.1667, "num_input_tokens_seen": 4741792, "step": 4720 }, { "epoch": 2.227722772277228, "grad_norm": 1.1115504503250122, "learning_rate": 4.9980338323191964e-05, "loss": 0.1881, "num_input_tokens_seen": 4746976, "step": 4725 }, { "epoch": 2.23008015087223, "grad_norm": 0.6904681324958801, "learning_rate": 4.9979928343701435e-05, "loss": 0.0887, "num_input_tokens_seen": 4752544, "step": 4730 }, { "epoch": 2.2324375294672323, "grad_norm": 1.262628197669983, "learning_rate": 4.997951413553317e-05, "loss": 0.2566, "num_input_tokens_seen": 4756928, "step": 4735 }, { "epoch": 2.2347949080622347, "grad_norm": 2.1529126167297363, "learning_rate": 4.9979095698757286e-05, "loss": 0.2493, "num_input_tokens_seen": 4764800, "step": 4740 }, { "epoch": 2.237152286657237, "grad_norm": 0.1564795821905136, "learning_rate": 4.9978673033444624e-05, "loss": 0.1484, "num_input_tokens_seen": 4771712, "step": 4745 }, { "epoch": 2.2395096652522395, "grad_norm": 0.3303097188472748, "learning_rate": 4.997824613966673e-05, "loss": 0.0937, "num_input_tokens_seen": 4776896, "step": 4750 }, { "epoch": 2.241867043847242, "grad_norm": 0.13487450778484344, "learning_rate": 4.9977815017495874e-05, "loss": 0.4588, "num_input_tokens_seen": 4782496, "step": 4755 }, { "epoch": 2.2442244224422443, "grad_norm": 1.7263858318328857, "learning_rate": 4.997737966700503e-05, "loss": 0.3253, "num_input_tokens_seen": 4788480, "step": 4760 }, { "epoch": 2.2465818010372467, "grad_norm": 0.4647906720638275, "learning_rate": 4.99769400882679e-05, "loss": 0.1231, "num_input_tokens_seen": 4793024, "step": 4765 }, { "epoch": 2.248939179632249, "grad_norm": 0.9206646680831909, "learning_rate": 4.997649628135891e-05, "loss": 0.0573, "num_input_tokens_seen": 4798336, "step": 4770 }, { "epoch": 2.251296558227251, "grad_norm": 1.650735855102539, "learning_rate": 4.997604824635316e-05, "loss": 0.4415, "num_input_tokens_seen": 4802336, "step": 4775 }, { "epoch": 2.2536539368222535, "grad_norm": 0.9067346453666687, "learning_rate": 4.9975595983326514e-05, "loss": 0.1112, "num_input_tokens_seen": 4807776, "step": 4780 }, { "epoch": 2.256011315417256, "grad_norm": 0.8629924058914185, "learning_rate": 4.997513949235554e-05, "loss": 0.1002, "num_input_tokens_seen": 4812896, "step": 4785 }, { "epoch": 2.2583686940122583, "grad_norm": 1.824482798576355, "learning_rate": 4.9974678773517495e-05, "loss": 0.1804, "num_input_tokens_seen": 4818560, "step": 4790 }, { "epoch": 2.2607260726072607, "grad_norm": 0.831035852432251, "learning_rate": 4.9974213826890384e-05, "loss": 0.2054, "num_input_tokens_seen": 4822912, "step": 4795 }, { "epoch": 2.263083451202263, "grad_norm": 0.24078220129013062, "learning_rate": 4.997374465255291e-05, "loss": 0.0719, "num_input_tokens_seen": 4827456, "step": 4800 }, { "epoch": 2.2654408297972655, "grad_norm": 1.0854710340499878, "learning_rate": 4.997327125058449e-05, "loss": 0.2403, "num_input_tokens_seen": 4832576, "step": 4805 }, { "epoch": 2.267798208392268, "grad_norm": 0.47580239176750183, "learning_rate": 4.997279362106527e-05, "loss": 0.2088, "num_input_tokens_seen": 4836736, "step": 4810 }, { "epoch": 2.2701555869872703, "grad_norm": 0.7440029382705688, "learning_rate": 4.9972311764076105e-05, "loss": 0.1471, "num_input_tokens_seen": 4841312, "step": 4815 }, { "epoch": 2.2725129655822727, "grad_norm": 0.7475154995918274, "learning_rate": 4.997182567969857e-05, "loss": 0.1743, "num_input_tokens_seen": 4845984, "step": 4820 }, { "epoch": 2.274870344177275, "grad_norm": 2.136636972427368, "learning_rate": 4.997133536801494e-05, "loss": 0.2044, "num_input_tokens_seen": 4851104, "step": 4825 }, { "epoch": 2.2772277227722775, "grad_norm": 0.9862480759620667, "learning_rate": 4.997084082910822e-05, "loss": 0.1128, "num_input_tokens_seen": 4856384, "step": 4830 }, { "epoch": 2.2795851013672794, "grad_norm": 1.2452646493911743, "learning_rate": 4.997034206306214e-05, "loss": 0.1627, "num_input_tokens_seen": 4861632, "step": 4835 }, { "epoch": 2.281942479962282, "grad_norm": 0.435690701007843, "learning_rate": 4.996983906996111e-05, "loss": 0.3689, "num_input_tokens_seen": 4866400, "step": 4840 }, { "epoch": 2.2842998585572842, "grad_norm": 0.6352941393852234, "learning_rate": 4.996933184989029e-05, "loss": 0.1265, "num_input_tokens_seen": 4871744, "step": 4845 }, { "epoch": 2.2866572371522866, "grad_norm": 0.24941281974315643, "learning_rate": 4.996882040293555e-05, "loss": 0.1261, "num_input_tokens_seen": 4876992, "step": 4850 }, { "epoch": 2.289014615747289, "grad_norm": 1.9803135395050049, "learning_rate": 4.996830472918345e-05, "loss": 0.3047, "num_input_tokens_seen": 4882304, "step": 4855 }, { "epoch": 2.2913719943422914, "grad_norm": 0.4120808243751526, "learning_rate": 4.9967784828721304e-05, "loss": 0.2495, "num_input_tokens_seen": 4886080, "step": 4860 }, { "epoch": 2.293729372937294, "grad_norm": 0.5052905082702637, "learning_rate": 4.9967260701637115e-05, "loss": 0.16, "num_input_tokens_seen": 4891264, "step": 4865 }, { "epoch": 2.2960867515322962, "grad_norm": 0.4823860824108124, "learning_rate": 4.9966732348019605e-05, "loss": 0.2015, "num_input_tokens_seen": 4897856, "step": 4870 }, { "epoch": 2.298444130127298, "grad_norm": 2.4892678260803223, "learning_rate": 4.996619976795823e-05, "loss": 0.1614, "num_input_tokens_seen": 4902240, "step": 4875 }, { "epoch": 2.3008015087223006, "grad_norm": 0.7943556904792786, "learning_rate": 4.9965662961543126e-05, "loss": 0.2593, "num_input_tokens_seen": 4906912, "step": 4880 }, { "epoch": 2.303158887317303, "grad_norm": 0.5219776034355164, "learning_rate": 4.996512192886518e-05, "loss": 0.1337, "num_input_tokens_seen": 4911488, "step": 4885 }, { "epoch": 2.3055162659123054, "grad_norm": 1.2508922815322876, "learning_rate": 4.996457667001597e-05, "loss": 0.1365, "num_input_tokens_seen": 4916032, "step": 4890 }, { "epoch": 2.307873644507308, "grad_norm": 1.6076289415359497, "learning_rate": 4.996402718508781e-05, "loss": 0.1035, "num_input_tokens_seen": 4921152, "step": 4895 }, { "epoch": 2.31023102310231, "grad_norm": 1.8105944395065308, "learning_rate": 4.996347347417371e-05, "loss": 0.1787, "num_input_tokens_seen": 4925280, "step": 4900 }, { "epoch": 2.3125884016973126, "grad_norm": 1.5187188386917114, "learning_rate": 4.99629155373674e-05, "loss": 0.2647, "num_input_tokens_seen": 4930496, "step": 4905 }, { "epoch": 2.314945780292315, "grad_norm": 0.6441831588745117, "learning_rate": 4.996235337476335e-05, "loss": 0.2122, "num_input_tokens_seen": 4935968, "step": 4910 }, { "epoch": 2.3173031588873174, "grad_norm": 0.22794204950332642, "learning_rate": 4.9961786986456707e-05, "loss": 0.0757, "num_input_tokens_seen": 4941312, "step": 4915 }, { "epoch": 2.31966053748232, "grad_norm": 0.2612764537334442, "learning_rate": 4.996121637254335e-05, "loss": 0.0924, "num_input_tokens_seen": 4946208, "step": 4920 }, { "epoch": 2.322017916077322, "grad_norm": 2.31731915473938, "learning_rate": 4.996064153311988e-05, "loss": 0.3685, "num_input_tokens_seen": 4951072, "step": 4925 }, { "epoch": 2.3243752946723246, "grad_norm": 0.7369402050971985, "learning_rate": 4.99600624682836e-05, "loss": 0.1283, "num_input_tokens_seen": 4954624, "step": 4930 }, { "epoch": 2.3267326732673266, "grad_norm": 0.6078770756721497, "learning_rate": 4.995947917813255e-05, "loss": 0.0904, "num_input_tokens_seen": 4959584, "step": 4935 }, { "epoch": 2.329090051862329, "grad_norm": 2.8328158855438232, "learning_rate": 4.995889166276546e-05, "loss": 0.2194, "num_input_tokens_seen": 4963776, "step": 4940 }, { "epoch": 2.3314474304573314, "grad_norm": 0.5394704937934875, "learning_rate": 4.9958299922281785e-05, "loss": 0.1594, "num_input_tokens_seen": 4968448, "step": 4945 }, { "epoch": 2.333804809052334, "grad_norm": 0.9310562014579773, "learning_rate": 4.995770395678171e-05, "loss": 0.1475, "num_input_tokens_seen": 4973280, "step": 4950 }, { "epoch": 2.336162187647336, "grad_norm": 0.07997889816761017, "learning_rate": 4.995710376636611e-05, "loss": 0.223, "num_input_tokens_seen": 4977984, "step": 4955 }, { "epoch": 2.3385195662423386, "grad_norm": 0.3157879710197449, "learning_rate": 4.995649935113658e-05, "loss": 0.2644, "num_input_tokens_seen": 4983136, "step": 4960 }, { "epoch": 2.340876944837341, "grad_norm": 0.3974154591560364, "learning_rate": 4.995589071119546e-05, "loss": 0.1948, "num_input_tokens_seen": 4988032, "step": 4965 }, { "epoch": 2.3432343234323434, "grad_norm": 0.5581972599029541, "learning_rate": 4.995527784664577e-05, "loss": 0.1785, "num_input_tokens_seen": 4994176, "step": 4970 }, { "epoch": 2.3455917020273453, "grad_norm": 1.3086241483688354, "learning_rate": 4.9954660757591244e-05, "loss": 0.1146, "num_input_tokens_seen": 5000480, "step": 4975 }, { "epoch": 2.3479490806223478, "grad_norm": 0.26329290866851807, "learning_rate": 4.995403944413636e-05, "loss": 0.087, "num_input_tokens_seen": 5004960, "step": 4980 }, { "epoch": 2.35030645921735, "grad_norm": 0.9417897462844849, "learning_rate": 4.9953413906386305e-05, "loss": 0.0908, "num_input_tokens_seen": 5009120, "step": 4985 }, { "epoch": 2.3526638378123526, "grad_norm": 0.2720443904399872, "learning_rate": 4.995278414444695e-05, "loss": 0.0896, "num_input_tokens_seen": 5013984, "step": 4990 }, { "epoch": 2.355021216407355, "grad_norm": 0.593783974647522, "learning_rate": 4.995215015842492e-05, "loss": 0.1746, "num_input_tokens_seen": 5018880, "step": 4995 }, { "epoch": 2.3573785950023574, "grad_norm": 1.6793580055236816, "learning_rate": 4.995151194842753e-05, "loss": 0.2027, "num_input_tokens_seen": 5024000, "step": 5000 }, { "epoch": 2.3597359735973598, "grad_norm": 1.176108479499817, "learning_rate": 4.995086951456282e-05, "loss": 0.1012, "num_input_tokens_seen": 5028064, "step": 5005 }, { "epoch": 2.362093352192362, "grad_norm": 0.6055029630661011, "learning_rate": 4.995022285693954e-05, "loss": 0.1164, "num_input_tokens_seen": 5032768, "step": 5010 }, { "epoch": 2.3644507307873646, "grad_norm": 2.1345300674438477, "learning_rate": 4.9949571975667166e-05, "loss": 0.4146, "num_input_tokens_seen": 5037024, "step": 5015 }, { "epoch": 2.366808109382367, "grad_norm": 0.32145175337791443, "learning_rate": 4.994891687085587e-05, "loss": 0.182, "num_input_tokens_seen": 5042688, "step": 5020 }, { "epoch": 2.3691654879773694, "grad_norm": 0.9758346676826477, "learning_rate": 4.994825754261656e-05, "loss": 0.1114, "num_input_tokens_seen": 5049120, "step": 5025 }, { "epoch": 2.3715228665723718, "grad_norm": 0.2466670572757721, "learning_rate": 4.994759399106085e-05, "loss": 0.071, "num_input_tokens_seen": 5054560, "step": 5030 }, { "epoch": 2.3738802451673737, "grad_norm": 0.24671678245067596, "learning_rate": 4.994692621630105e-05, "loss": 0.0988, "num_input_tokens_seen": 5059232, "step": 5035 }, { "epoch": 2.376237623762376, "grad_norm": 0.7738720178604126, "learning_rate": 4.9946254218450225e-05, "loss": 0.264, "num_input_tokens_seen": 5063936, "step": 5040 }, { "epoch": 2.3785950023573785, "grad_norm": 0.06726626306772232, "learning_rate": 4.9945577997622126e-05, "loss": 0.2079, "num_input_tokens_seen": 5069920, "step": 5045 }, { "epoch": 2.380952380952381, "grad_norm": 2.324503183364868, "learning_rate": 4.994489755393122e-05, "loss": 0.268, "num_input_tokens_seen": 5076128, "step": 5050 }, { "epoch": 2.3833097595473833, "grad_norm": 0.08988231420516968, "learning_rate": 4.9944212887492704e-05, "loss": 0.1646, "num_input_tokens_seen": 5080448, "step": 5055 }, { "epoch": 2.3856671381423857, "grad_norm": 2.611508846282959, "learning_rate": 4.9943523998422465e-05, "loss": 0.1405, "num_input_tokens_seen": 5085312, "step": 5060 }, { "epoch": 2.388024516737388, "grad_norm": 1.0769286155700684, "learning_rate": 4.9942830886837134e-05, "loss": 0.2061, "num_input_tokens_seen": 5089632, "step": 5065 }, { "epoch": 2.3903818953323905, "grad_norm": 0.22180695831775665, "learning_rate": 4.994213355285404e-05, "loss": 0.1957, "num_input_tokens_seen": 5094208, "step": 5070 }, { "epoch": 2.3927392739273925, "grad_norm": 1.3687427043914795, "learning_rate": 4.994143199659123e-05, "loss": 0.2041, "num_input_tokens_seen": 5099424, "step": 5075 }, { "epoch": 2.395096652522395, "grad_norm": 1.7791980504989624, "learning_rate": 4.994072621816746e-05, "loss": 0.2556, "num_input_tokens_seen": 5104896, "step": 5080 }, { "epoch": 2.3974540311173973, "grad_norm": 0.7914581298828125, "learning_rate": 4.9940016217702216e-05, "loss": 0.272, "num_input_tokens_seen": 5109952, "step": 5085 }, { "epoch": 2.3998114097123997, "grad_norm": 0.09463602304458618, "learning_rate": 4.993930199531569e-05, "loss": 0.4237, "num_input_tokens_seen": 5115040, "step": 5090 }, { "epoch": 2.402168788307402, "grad_norm": 1.9998873472213745, "learning_rate": 4.9938583551128774e-05, "loss": 0.2114, "num_input_tokens_seen": 5120864, "step": 5095 }, { "epoch": 2.4045261669024045, "grad_norm": 0.6541904211044312, "learning_rate": 4.993786088526309e-05, "loss": 0.1339, "num_input_tokens_seen": 5125408, "step": 5100 }, { "epoch": 2.406883545497407, "grad_norm": 2.03721284866333, "learning_rate": 4.993713399784098e-05, "loss": 0.1197, "num_input_tokens_seen": 5130560, "step": 5105 }, { "epoch": 2.4092409240924093, "grad_norm": 0.6054621338844299, "learning_rate": 4.9936402888985504e-05, "loss": 0.3248, "num_input_tokens_seen": 5134784, "step": 5110 }, { "epoch": 2.4115983026874117, "grad_norm": 1.492087483406067, "learning_rate": 4.9935667558820405e-05, "loss": 0.2367, "num_input_tokens_seen": 5142208, "step": 5115 }, { "epoch": 2.413955681282414, "grad_norm": 1.8380852937698364, "learning_rate": 4.993492800747017e-05, "loss": 0.2467, "num_input_tokens_seen": 5147744, "step": 5120 }, { "epoch": 2.4163130598774165, "grad_norm": 0.20431792736053467, "learning_rate": 4.993418423506e-05, "loss": 0.098, "num_input_tokens_seen": 5154272, "step": 5125 }, { "epoch": 2.418670438472419, "grad_norm": 1.265791893005371, "learning_rate": 4.99334362417158e-05, "loss": 0.1549, "num_input_tokens_seen": 5159488, "step": 5130 }, { "epoch": 2.421027817067421, "grad_norm": 0.6046030521392822, "learning_rate": 4.993268402756418e-05, "loss": 0.2419, "num_input_tokens_seen": 5164352, "step": 5135 }, { "epoch": 2.4233851956624233, "grad_norm": 1.168443202972412, "learning_rate": 4.99319275927325e-05, "loss": 0.1835, "num_input_tokens_seen": 5169920, "step": 5140 }, { "epoch": 2.4257425742574257, "grad_norm": 1.0509158372879028, "learning_rate": 4.9931166937348785e-05, "loss": 0.1705, "num_input_tokens_seen": 5173792, "step": 5145 }, { "epoch": 2.428099952852428, "grad_norm": 0.9115732908248901, "learning_rate": 4.993040206154183e-05, "loss": 0.2339, "num_input_tokens_seen": 5181568, "step": 5150 }, { "epoch": 2.4304573314474305, "grad_norm": 0.7547272443771362, "learning_rate": 4.9929632965441084e-05, "loss": 0.1507, "num_input_tokens_seen": 5188992, "step": 5155 }, { "epoch": 2.432814710042433, "grad_norm": 0.4083046317100525, "learning_rate": 4.992885964917676e-05, "loss": 0.088, "num_input_tokens_seen": 5194176, "step": 5160 }, { "epoch": 2.4351720886374353, "grad_norm": 1.8501988649368286, "learning_rate": 4.9928082112879776e-05, "loss": 0.3511, "num_input_tokens_seen": 5198656, "step": 5165 }, { "epoch": 2.4375294672324377, "grad_norm": 0.7602760195732117, "learning_rate": 4.992730035668174e-05, "loss": 0.2211, "num_input_tokens_seen": 5203968, "step": 5170 }, { "epoch": 2.4398868458274396, "grad_norm": 0.13579094409942627, "learning_rate": 4.992651438071499e-05, "loss": 0.2857, "num_input_tokens_seen": 5208384, "step": 5175 }, { "epoch": 2.442244224422442, "grad_norm": 0.053272832185029984, "learning_rate": 4.9925724185112594e-05, "loss": 0.1075, "num_input_tokens_seen": 5212544, "step": 5180 }, { "epoch": 2.4446016030174444, "grad_norm": 0.32437512278556824, "learning_rate": 4.99249297700083e-05, "loss": 0.2938, "num_input_tokens_seen": 5218080, "step": 5185 }, { "epoch": 2.446958981612447, "grad_norm": 0.8970376253128052, "learning_rate": 4.99241311355366e-05, "loss": 0.1628, "num_input_tokens_seen": 5222784, "step": 5190 }, { "epoch": 2.4493163602074493, "grad_norm": 1.4876964092254639, "learning_rate": 4.992332828183269e-05, "loss": 0.3317, "num_input_tokens_seen": 5228832, "step": 5195 }, { "epoch": 2.4516737388024517, "grad_norm": 0.5608078837394714, "learning_rate": 4.992252120903247e-05, "loss": 0.2336, "num_input_tokens_seen": 5233536, "step": 5200 }, { "epoch": 2.454031117397454, "grad_norm": 0.7373106479644775, "learning_rate": 4.9921709917272576e-05, "loss": 0.205, "num_input_tokens_seen": 5238560, "step": 5205 }, { "epoch": 2.4563884959924565, "grad_norm": 0.6729576587677002, "learning_rate": 4.9920894406690336e-05, "loss": 0.1581, "num_input_tokens_seen": 5242560, "step": 5210 }, { "epoch": 2.458745874587459, "grad_norm": 1.8736399412155151, "learning_rate": 4.99200746774238e-05, "loss": 0.2302, "num_input_tokens_seen": 5247552, "step": 5215 }, { "epoch": 2.4611032531824613, "grad_norm": 0.6901644468307495, "learning_rate": 4.991925072961174e-05, "loss": 0.1578, "num_input_tokens_seen": 5252448, "step": 5220 }, { "epoch": 2.4634606317774637, "grad_norm": 0.42308348417282104, "learning_rate": 4.991842256339364e-05, "loss": 0.0761, "num_input_tokens_seen": 5256544, "step": 5225 }, { "epoch": 2.465818010372466, "grad_norm": 0.2732510268688202, "learning_rate": 4.991759017890969e-05, "loss": 0.1943, "num_input_tokens_seen": 5261184, "step": 5230 }, { "epoch": 2.468175388967468, "grad_norm": 1.1812564134597778, "learning_rate": 4.991675357630081e-05, "loss": 0.1542, "num_input_tokens_seen": 5266144, "step": 5235 }, { "epoch": 2.4705327675624704, "grad_norm": 0.9281294941902161, "learning_rate": 4.99159127557086e-05, "loss": 0.1958, "num_input_tokens_seen": 5272960, "step": 5240 }, { "epoch": 2.472890146157473, "grad_norm": 0.3070501685142517, "learning_rate": 4.991506771727541e-05, "loss": 0.1902, "num_input_tokens_seen": 5279136, "step": 5245 }, { "epoch": 2.4752475247524752, "grad_norm": 1.4542744159698486, "learning_rate": 4.991421846114429e-05, "loss": 0.2063, "num_input_tokens_seen": 5284448, "step": 5250 }, { "epoch": 2.4776049033474776, "grad_norm": 1.2761905193328857, "learning_rate": 4.991336498745901e-05, "loss": 0.1641, "num_input_tokens_seen": 5289408, "step": 5255 }, { "epoch": 2.47996228194248, "grad_norm": 0.8219074606895447, "learning_rate": 4.991250729636403e-05, "loss": 0.1601, "num_input_tokens_seen": 5294688, "step": 5260 }, { "epoch": 2.4823196605374824, "grad_norm": 0.21591699123382568, "learning_rate": 4.991164538800457e-05, "loss": 0.2242, "num_input_tokens_seen": 5299360, "step": 5265 }, { "epoch": 2.484677039132485, "grad_norm": 1.4328784942626953, "learning_rate": 4.991077926252651e-05, "loss": 0.1753, "num_input_tokens_seen": 5305472, "step": 5270 }, { "epoch": 2.487034417727487, "grad_norm": 0.24700258672237396, "learning_rate": 4.9909908920076484e-05, "loss": 0.1033, "num_input_tokens_seen": 5311680, "step": 5275 }, { "epoch": 2.489391796322489, "grad_norm": 1.1266511678695679, "learning_rate": 4.990903436080183e-05, "loss": 0.2243, "num_input_tokens_seen": 5317184, "step": 5280 }, { "epoch": 2.4917491749174916, "grad_norm": 0.2066873013973236, "learning_rate": 4.990815558485058e-05, "loss": 0.0337, "num_input_tokens_seen": 5321952, "step": 5285 }, { "epoch": 2.494106553512494, "grad_norm": 1.7461780309677124, "learning_rate": 4.9907272592371515e-05, "loss": 0.1322, "num_input_tokens_seen": 5328800, "step": 5290 }, { "epoch": 2.4964639321074964, "grad_norm": 0.3321126699447632, "learning_rate": 4.9906385383514095e-05, "loss": 0.201, "num_input_tokens_seen": 5333472, "step": 5295 }, { "epoch": 2.498821310702499, "grad_norm": 1.911885380744934, "learning_rate": 4.990549395842852e-05, "loss": 0.1395, "num_input_tokens_seen": 5338368, "step": 5300 }, { "epoch": 2.501178689297501, "grad_norm": 2.0693821907043457, "learning_rate": 4.990459831726568e-05, "loss": 0.215, "num_input_tokens_seen": 5343200, "step": 5305 }, { "epoch": 2.5035360678925036, "grad_norm": 0.8696852922439575, "learning_rate": 4.9903698460177214e-05, "loss": 0.3365, "num_input_tokens_seen": 5348128, "step": 5310 }, { "epoch": 2.505893446487506, "grad_norm": 0.15307645499706268, "learning_rate": 4.9902794387315434e-05, "loss": 0.2673, "num_input_tokens_seen": 5353504, "step": 5315 }, { "epoch": 2.5082508250825084, "grad_norm": 0.8688222765922546, "learning_rate": 4.9901886098833396e-05, "loss": 0.3361, "num_input_tokens_seen": 5357760, "step": 5320 }, { "epoch": 2.510608203677511, "grad_norm": 0.8109961152076721, "learning_rate": 4.9900973594884834e-05, "loss": 0.1863, "num_input_tokens_seen": 5362144, "step": 5325 }, { "epoch": 2.512965582272513, "grad_norm": 0.15345638990402222, "learning_rate": 4.990005687562426e-05, "loss": 0.2443, "num_input_tokens_seen": 5366560, "step": 5330 }, { "epoch": 2.515322960867515, "grad_norm": 0.4302290976047516, "learning_rate": 4.989913594120682e-05, "loss": 0.1653, "num_input_tokens_seen": 5371168, "step": 5335 }, { "epoch": 2.5176803394625176, "grad_norm": 0.5640825629234314, "learning_rate": 4.989821079178843e-05, "loss": 0.1921, "num_input_tokens_seen": 5376096, "step": 5340 }, { "epoch": 2.52003771805752, "grad_norm": 1.3876456022262573, "learning_rate": 4.98972814275257e-05, "loss": 0.1516, "num_input_tokens_seen": 5380928, "step": 5345 }, { "epoch": 2.5223950966525224, "grad_norm": 1.907266616821289, "learning_rate": 4.9896347848575964e-05, "loss": 0.257, "num_input_tokens_seen": 5385600, "step": 5350 }, { "epoch": 2.5247524752475248, "grad_norm": 0.9103085398674011, "learning_rate": 4.989541005509725e-05, "loss": 0.1552, "num_input_tokens_seen": 5389760, "step": 5355 }, { "epoch": 2.527109853842527, "grad_norm": 3.9296391010284424, "learning_rate": 4.989446804724831e-05, "loss": 0.2798, "num_input_tokens_seen": 5395296, "step": 5360 }, { "epoch": 2.5294672324375296, "grad_norm": 0.3420675992965698, "learning_rate": 4.9893521825188615e-05, "loss": 0.2447, "num_input_tokens_seen": 5399072, "step": 5365 }, { "epoch": 2.531824611032532, "grad_norm": 0.46936243772506714, "learning_rate": 4.989257138907834e-05, "loss": 0.1394, "num_input_tokens_seen": 5404128, "step": 5370 }, { "epoch": 2.534181989627534, "grad_norm": 0.4488341510295868, "learning_rate": 4.989161673907839e-05, "loss": 0.0794, "num_input_tokens_seen": 5408736, "step": 5375 }, { "epoch": 2.5365393682225363, "grad_norm": 0.18484720587730408, "learning_rate": 4.989065787535035e-05, "loss": 0.1039, "num_input_tokens_seen": 5414080, "step": 5380 }, { "epoch": 2.5388967468175387, "grad_norm": 0.4689485728740692, "learning_rate": 4.9889694798056555e-05, "loss": 0.1816, "num_input_tokens_seen": 5418784, "step": 5385 }, { "epoch": 2.541254125412541, "grad_norm": 1.699514627456665, "learning_rate": 4.988872750736003e-05, "loss": 0.1609, "num_input_tokens_seen": 5424704, "step": 5390 }, { "epoch": 2.5436115040075435, "grad_norm": 1.262975811958313, "learning_rate": 4.9887756003424534e-05, "loss": 0.2696, "num_input_tokens_seen": 5428928, "step": 5395 }, { "epoch": 2.545968882602546, "grad_norm": 0.4768259823322296, "learning_rate": 4.988678028641451e-05, "loss": 0.1765, "num_input_tokens_seen": 5433856, "step": 5400 }, { "epoch": 2.5483262611975483, "grad_norm": 0.4650203585624695, "learning_rate": 4.988580035649515e-05, "loss": 0.2205, "num_input_tokens_seen": 5438688, "step": 5405 }, { "epoch": 2.5506836397925507, "grad_norm": 0.9081720113754272, "learning_rate": 4.9884816213832306e-05, "loss": 0.2364, "num_input_tokens_seen": 5444672, "step": 5410 }, { "epoch": 2.553041018387553, "grad_norm": 2.5564262866973877, "learning_rate": 4.9883827858592615e-05, "loss": 0.247, "num_input_tokens_seen": 5450720, "step": 5415 }, { "epoch": 2.5553983969825556, "grad_norm": 1.6081609725952148, "learning_rate": 4.988283529094336e-05, "loss": 0.178, "num_input_tokens_seen": 5454432, "step": 5420 }, { "epoch": 2.557755775577558, "grad_norm": 0.39324137568473816, "learning_rate": 4.988183851105258e-05, "loss": 0.1489, "num_input_tokens_seen": 5459424, "step": 5425 }, { "epoch": 2.5601131541725604, "grad_norm": 1.472010612487793, "learning_rate": 4.988083751908901e-05, "loss": 0.2034, "num_input_tokens_seen": 5464288, "step": 5430 }, { "epoch": 2.5624705327675623, "grad_norm": 0.40388554334640503, "learning_rate": 4.9879832315222096e-05, "loss": 0.2182, "num_input_tokens_seen": 5468736, "step": 5435 }, { "epoch": 2.5648279113625647, "grad_norm": 1.0456775426864624, "learning_rate": 4.987882289962201e-05, "loss": 0.1144, "num_input_tokens_seen": 5474400, "step": 5440 }, { "epoch": 2.567185289957567, "grad_norm": 0.38427773118019104, "learning_rate": 4.987780927245963e-05, "loss": 0.0851, "num_input_tokens_seen": 5479808, "step": 5445 }, { "epoch": 2.5695426685525695, "grad_norm": 1.04520583152771, "learning_rate": 4.987679143390653e-05, "loss": 0.173, "num_input_tokens_seen": 5485632, "step": 5450 }, { "epoch": 2.571900047147572, "grad_norm": 0.5170725584030151, "learning_rate": 4.987576938413504e-05, "loss": 0.1999, "num_input_tokens_seen": 5492128, "step": 5455 }, { "epoch": 2.5742574257425743, "grad_norm": 1.8279889822006226, "learning_rate": 4.987474312331815e-05, "loss": 0.1993, "num_input_tokens_seen": 5496480, "step": 5460 }, { "epoch": 2.5766148043375767, "grad_norm": 0.4718334972858429, "learning_rate": 4.98737126516296e-05, "loss": 0.1127, "num_input_tokens_seen": 5501344, "step": 5465 }, { "epoch": 2.578972182932579, "grad_norm": 1.269897222518921, "learning_rate": 4.987267796924383e-05, "loss": 0.1772, "num_input_tokens_seen": 5506304, "step": 5470 }, { "epoch": 2.581329561527581, "grad_norm": 0.12202814966440201, "learning_rate": 4.987163907633599e-05, "loss": 0.0877, "num_input_tokens_seen": 5510016, "step": 5475 }, { "epoch": 2.5836869401225835, "grad_norm": 2.400843381881714, "learning_rate": 4.9870595973081956e-05, "loss": 0.2462, "num_input_tokens_seen": 5515456, "step": 5480 }, { "epoch": 2.586044318717586, "grad_norm": 2.5654847621917725, "learning_rate": 4.98695486596583e-05, "loss": 0.497, "num_input_tokens_seen": 5521568, "step": 5485 }, { "epoch": 2.5884016973125883, "grad_norm": 0.8213427066802979, "learning_rate": 4.986849713624231e-05, "loss": 0.1149, "num_input_tokens_seen": 5526304, "step": 5490 }, { "epoch": 2.5907590759075907, "grad_norm": 0.3010060787200928, "learning_rate": 4.986744140301201e-05, "loss": 0.1624, "num_input_tokens_seen": 5532608, "step": 5495 }, { "epoch": 2.593116454502593, "grad_norm": 0.2458294928073883, "learning_rate": 4.98663814601461e-05, "loss": 0.1642, "num_input_tokens_seen": 5537312, "step": 5500 }, { "epoch": 2.5954738330975955, "grad_norm": 0.9270186424255371, "learning_rate": 4.9865317307824016e-05, "loss": 0.1713, "num_input_tokens_seen": 5542720, "step": 5505 }, { "epoch": 2.597831211692598, "grad_norm": 0.7952637076377869, "learning_rate": 4.98642489462259e-05, "loss": 0.1454, "num_input_tokens_seen": 5546816, "step": 5510 }, { "epoch": 2.6001885902876003, "grad_norm": 0.034202855080366135, "learning_rate": 4.986317637553261e-05, "loss": 0.1779, "num_input_tokens_seen": 5551936, "step": 5515 }, { "epoch": 2.6025459688826027, "grad_norm": 1.071338176727295, "learning_rate": 4.9862099595925716e-05, "loss": 0.2325, "num_input_tokens_seen": 5557120, "step": 5520 }, { "epoch": 2.604903347477605, "grad_norm": 1.1543712615966797, "learning_rate": 4.9861018607587485e-05, "loss": 0.2029, "num_input_tokens_seen": 5561728, "step": 5525 }, { "epoch": 2.6072607260726075, "grad_norm": 1.4001926183700562, "learning_rate": 4.985993341070093e-05, "loss": 0.1595, "num_input_tokens_seen": 5566656, "step": 5530 }, { "epoch": 2.6096181046676095, "grad_norm": 0.7087225317955017, "learning_rate": 4.985884400544974e-05, "loss": 0.1446, "num_input_tokens_seen": 5570720, "step": 5535 }, { "epoch": 2.611975483262612, "grad_norm": 2.770397424697876, "learning_rate": 4.9857750392018346e-05, "loss": 0.2652, "num_input_tokens_seen": 5575328, "step": 5540 }, { "epoch": 2.6143328618576143, "grad_norm": 0.5926831364631653, "learning_rate": 4.985665257059186e-05, "loss": 0.1351, "num_input_tokens_seen": 5580352, "step": 5545 }, { "epoch": 2.6166902404526167, "grad_norm": 0.3387841582298279, "learning_rate": 4.9855550541356145e-05, "loss": 0.2087, "num_input_tokens_seen": 5585440, "step": 5550 }, { "epoch": 2.619047619047619, "grad_norm": 0.783606767654419, "learning_rate": 4.9854444304497735e-05, "loss": 0.1121, "num_input_tokens_seen": 5590208, "step": 5555 }, { "epoch": 2.6214049976426215, "grad_norm": 0.9796265363693237, "learning_rate": 4.985333386020392e-05, "loss": 0.1772, "num_input_tokens_seen": 5595872, "step": 5560 }, { "epoch": 2.623762376237624, "grad_norm": 1.8576807975769043, "learning_rate": 4.9852219208662666e-05, "loss": 0.2124, "num_input_tokens_seen": 5601536, "step": 5565 }, { "epoch": 2.6261197548326263, "grad_norm": 0.04427110776305199, "learning_rate": 4.9851100350062665e-05, "loss": 0.2902, "num_input_tokens_seen": 5605760, "step": 5570 }, { "epoch": 2.6284771334276282, "grad_norm": 1.6337789297103882, "learning_rate": 4.984997728459332e-05, "loss": 0.1525, "num_input_tokens_seen": 5610976, "step": 5575 }, { "epoch": 2.6308345120226306, "grad_norm": 0.24099797010421753, "learning_rate": 4.984885001244476e-05, "loss": 0.1203, "num_input_tokens_seen": 5616352, "step": 5580 }, { "epoch": 2.633191890617633, "grad_norm": 0.3549099564552307, "learning_rate": 4.9847718533807797e-05, "loss": 0.1216, "num_input_tokens_seen": 5621312, "step": 5585 }, { "epoch": 2.6355492692126354, "grad_norm": 0.3078872561454773, "learning_rate": 4.9846582848873965e-05, "loss": 0.0769, "num_input_tokens_seen": 5626880, "step": 5590 }, { "epoch": 2.637906647807638, "grad_norm": 2.9085772037506104, "learning_rate": 4.984544295783554e-05, "loss": 0.2171, "num_input_tokens_seen": 5632192, "step": 5595 }, { "epoch": 2.6402640264026402, "grad_norm": 0.3442806005477905, "learning_rate": 4.9844298860885474e-05, "loss": 0.092, "num_input_tokens_seen": 5637536, "step": 5600 }, { "epoch": 2.6426214049976426, "grad_norm": 0.18846677243709564, "learning_rate": 4.984315055821744e-05, "loss": 0.1205, "num_input_tokens_seen": 5642944, "step": 5605 }, { "epoch": 2.644978783592645, "grad_norm": 0.9842584729194641, "learning_rate": 4.984199805002583e-05, "loss": 0.3221, "num_input_tokens_seen": 5647808, "step": 5610 }, { "epoch": 2.6473361621876474, "grad_norm": 0.4764038324356079, "learning_rate": 4.984084133650574e-05, "loss": 0.1417, "num_input_tokens_seen": 5652576, "step": 5615 }, { "epoch": 2.64969354078265, "grad_norm": 2.4978227615356445, "learning_rate": 4.983968041785298e-05, "loss": 0.1971, "num_input_tokens_seen": 5658592, "step": 5620 }, { "epoch": 2.6520509193776522, "grad_norm": 0.7672163844108582, "learning_rate": 4.983851529426409e-05, "loss": 0.121, "num_input_tokens_seen": 5663040, "step": 5625 }, { "epoch": 2.6544082979726547, "grad_norm": 0.2639606297016144, "learning_rate": 4.9837345965936296e-05, "loss": 0.2426, "num_input_tokens_seen": 5667456, "step": 5630 }, { "epoch": 2.6567656765676566, "grad_norm": 0.8072260022163391, "learning_rate": 4.983617243306754e-05, "loss": 0.0934, "num_input_tokens_seen": 5672736, "step": 5635 }, { "epoch": 2.659123055162659, "grad_norm": 1.1235036849975586, "learning_rate": 4.983499469585648e-05, "loss": 0.0935, "num_input_tokens_seen": 5678080, "step": 5640 }, { "epoch": 2.6614804337576614, "grad_norm": 0.37428051233291626, "learning_rate": 4.9833812754502495e-05, "loss": 0.087, "num_input_tokens_seen": 5682304, "step": 5645 }, { "epoch": 2.663837812352664, "grad_norm": 0.4526046812534332, "learning_rate": 4.983262660920567e-05, "loss": 0.1478, "num_input_tokens_seen": 5688032, "step": 5650 }, { "epoch": 2.666195190947666, "grad_norm": 1.2185932397842407, "learning_rate": 4.9831436260166783e-05, "loss": 0.2607, "num_input_tokens_seen": 5692256, "step": 5655 }, { "epoch": 2.6685525695426686, "grad_norm": 0.47504955530166626, "learning_rate": 4.9830241707587365e-05, "loss": 0.0676, "num_input_tokens_seen": 5696448, "step": 5660 }, { "epoch": 2.670909948137671, "grad_norm": 2.4572441577911377, "learning_rate": 4.982904295166961e-05, "loss": 0.2864, "num_input_tokens_seen": 5701088, "step": 5665 }, { "epoch": 2.6732673267326734, "grad_norm": 1.7570741176605225, "learning_rate": 4.982783999261647e-05, "loss": 0.1283, "num_input_tokens_seen": 5707008, "step": 5670 }, { "epoch": 2.6756247053276754, "grad_norm": 2.3902554512023926, "learning_rate": 4.982663283063155e-05, "loss": 0.2287, "num_input_tokens_seen": 5711648, "step": 5675 }, { "epoch": 2.677982083922678, "grad_norm": 2.0023369789123535, "learning_rate": 4.9825421465919244e-05, "loss": 0.2661, "num_input_tokens_seen": 5716384, "step": 5680 }, { "epoch": 2.68033946251768, "grad_norm": 0.684716522693634, "learning_rate": 4.9824205898684586e-05, "loss": 0.1101, "num_input_tokens_seen": 5721408, "step": 5685 }, { "epoch": 2.6826968411126826, "grad_norm": 0.6631907224655151, "learning_rate": 4.982298612913335e-05, "loss": 0.1255, "num_input_tokens_seen": 5727552, "step": 5690 }, { "epoch": 2.685054219707685, "grad_norm": 0.6407110095024109, "learning_rate": 4.982176215747205e-05, "loss": 0.0974, "num_input_tokens_seen": 5731424, "step": 5695 }, { "epoch": 2.6874115983026874, "grad_norm": 0.45942842960357666, "learning_rate": 4.982053398390786e-05, "loss": 0.129, "num_input_tokens_seen": 5737504, "step": 5700 }, { "epoch": 2.68976897689769, "grad_norm": 0.6608368158340454, "learning_rate": 4.981930160864869e-05, "loss": 0.1904, "num_input_tokens_seen": 5741600, "step": 5705 }, { "epoch": 2.692126355492692, "grad_norm": 0.17354081571102142, "learning_rate": 4.981806503190318e-05, "loss": 0.0432, "num_input_tokens_seen": 5746528, "step": 5710 }, { "epoch": 2.6944837340876946, "grad_norm": 0.5675763487815857, "learning_rate": 4.9816824253880636e-05, "loss": 0.1403, "num_input_tokens_seen": 5751040, "step": 5715 }, { "epoch": 2.696841112682697, "grad_norm": 1.8711655139923096, "learning_rate": 4.9815579274791116e-05, "loss": 0.2384, "num_input_tokens_seen": 5756736, "step": 5720 }, { "epoch": 2.6991984912776994, "grad_norm": 1.1326158046722412, "learning_rate": 4.9814330094845366e-05, "loss": 0.2188, "num_input_tokens_seen": 5761376, "step": 5725 }, { "epoch": 2.701555869872702, "grad_norm": 1.5031707286834717, "learning_rate": 4.981307671425485e-05, "loss": 0.1931, "num_input_tokens_seen": 5766272, "step": 5730 }, { "epoch": 2.7039132484677038, "grad_norm": 1.9527407884597778, "learning_rate": 4.981181913323175e-05, "loss": 0.2924, "num_input_tokens_seen": 5770944, "step": 5735 }, { "epoch": 2.706270627062706, "grad_norm": 0.5698434114456177, "learning_rate": 4.9810557351988955e-05, "loss": 0.0651, "num_input_tokens_seen": 5776288, "step": 5740 }, { "epoch": 2.7086280056577086, "grad_norm": 1.8583072423934937, "learning_rate": 4.9809291370740066e-05, "loss": 0.2447, "num_input_tokens_seen": 5780544, "step": 5745 }, { "epoch": 2.710985384252711, "grad_norm": 0.1630517989397049, "learning_rate": 4.980802118969937e-05, "loss": 0.2215, "num_input_tokens_seen": 5785600, "step": 5750 }, { "epoch": 2.7133427628477134, "grad_norm": 0.17575812339782715, "learning_rate": 4.980674680908192e-05, "loss": 0.0786, "num_input_tokens_seen": 5790080, "step": 5755 }, { "epoch": 2.7157001414427158, "grad_norm": 0.134353369474411, "learning_rate": 4.9805468229103414e-05, "loss": 0.1321, "num_input_tokens_seen": 5794368, "step": 5760 }, { "epoch": 2.718057520037718, "grad_norm": 2.7103121280670166, "learning_rate": 4.980418544998032e-05, "loss": 0.4667, "num_input_tokens_seen": 5798784, "step": 5765 }, { "epoch": 2.7204148986327206, "grad_norm": 0.6474967002868652, "learning_rate": 4.980289847192978e-05, "loss": 0.0542, "num_input_tokens_seen": 5803456, "step": 5770 }, { "epoch": 2.7227722772277225, "grad_norm": 0.9586413502693176, "learning_rate": 4.980160729516965e-05, "loss": 0.1967, "num_input_tokens_seen": 5809088, "step": 5775 }, { "epoch": 2.725129655822725, "grad_norm": 1.4417084455490112, "learning_rate": 4.980031191991852e-05, "loss": 0.1306, "num_input_tokens_seen": 5815488, "step": 5780 }, { "epoch": 2.7274870344177273, "grad_norm": 0.7024586796760559, "learning_rate": 4.979901234639567e-05, "loss": 0.1076, "num_input_tokens_seen": 5820512, "step": 5785 }, { "epoch": 2.7298444130127297, "grad_norm": 1.143248438835144, "learning_rate": 4.979770857482108e-05, "loss": 0.1246, "num_input_tokens_seen": 5824864, "step": 5790 }, { "epoch": 2.732201791607732, "grad_norm": 0.30783507227897644, "learning_rate": 4.979640060541548e-05, "loss": 0.1315, "num_input_tokens_seen": 5829312, "step": 5795 }, { "epoch": 2.7345591702027345, "grad_norm": 0.14111287891864777, "learning_rate": 4.979508843840027e-05, "loss": 0.1425, "num_input_tokens_seen": 5833824, "step": 5800 }, { "epoch": 2.736916548797737, "grad_norm": 0.9827951192855835, "learning_rate": 4.979377207399759e-05, "loss": 0.0716, "num_input_tokens_seen": 5839488, "step": 5805 }, { "epoch": 2.7392739273927393, "grad_norm": 1.7693685293197632, "learning_rate": 4.979245151243027e-05, "loss": 0.1054, "num_input_tokens_seen": 5844288, "step": 5810 }, { "epoch": 2.7416313059877417, "grad_norm": 0.7028456926345825, "learning_rate": 4.9791126753921864e-05, "loss": 0.103, "num_input_tokens_seen": 5848736, "step": 5815 }, { "epoch": 2.743988684582744, "grad_norm": 3.2215073108673096, "learning_rate": 4.9789797798696627e-05, "loss": 0.3742, "num_input_tokens_seen": 5853856, "step": 5820 }, { "epoch": 2.7463460631777465, "grad_norm": 0.24958527088165283, "learning_rate": 4.978846464697953e-05, "loss": 0.1089, "num_input_tokens_seen": 5859712, "step": 5825 }, { "epoch": 2.748703441772749, "grad_norm": 0.3488904535770416, "learning_rate": 4.978712729899624e-05, "loss": 0.1408, "num_input_tokens_seen": 5864448, "step": 5830 }, { "epoch": 2.751060820367751, "grad_norm": 1.9230544567108154, "learning_rate": 4.978578575497317e-05, "loss": 0.0986, "num_input_tokens_seen": 5869408, "step": 5835 }, { "epoch": 2.7534181989627533, "grad_norm": 0.8793212175369263, "learning_rate": 4.9784440015137414e-05, "loss": 0.1857, "num_input_tokens_seen": 5874816, "step": 5840 }, { "epoch": 2.7557755775577557, "grad_norm": 1.302626132965088, "learning_rate": 4.978309007971678e-05, "loss": 0.2596, "num_input_tokens_seen": 5881536, "step": 5845 }, { "epoch": 2.758132956152758, "grad_norm": 0.35905274748802185, "learning_rate": 4.978173594893978e-05, "loss": 0.097, "num_input_tokens_seen": 5886336, "step": 5850 }, { "epoch": 2.7604903347477605, "grad_norm": 0.4369804263114929, "learning_rate": 4.978037762303566e-05, "loss": 0.2139, "num_input_tokens_seen": 5891200, "step": 5855 }, { "epoch": 2.762847713342763, "grad_norm": 1.6337482929229736, "learning_rate": 4.977901510223435e-05, "loss": 0.1234, "num_input_tokens_seen": 5896128, "step": 5860 }, { "epoch": 2.7652050919377653, "grad_norm": 0.8848215937614441, "learning_rate": 4.9777648386766506e-05, "loss": 0.0868, "num_input_tokens_seen": 5900480, "step": 5865 }, { "epoch": 2.7675624705327677, "grad_norm": 0.08713704347610474, "learning_rate": 4.977627747686349e-05, "loss": 0.0693, "num_input_tokens_seen": 5905120, "step": 5870 }, { "epoch": 2.7699198491277697, "grad_norm": 0.24353694915771484, "learning_rate": 4.977490237275738e-05, "loss": 0.0877, "num_input_tokens_seen": 5909984, "step": 5875 }, { "epoch": 2.772277227722772, "grad_norm": 1.2105361223220825, "learning_rate": 4.977352307468095e-05, "loss": 0.0842, "num_input_tokens_seen": 5914848, "step": 5880 }, { "epoch": 2.7746346063177745, "grad_norm": 1.1118501424789429, "learning_rate": 4.9772139582867695e-05, "loss": 0.3234, "num_input_tokens_seen": 5919872, "step": 5885 }, { "epoch": 2.776991984912777, "grad_norm": 0.333951473236084, "learning_rate": 4.9770751897551816e-05, "loss": 0.1614, "num_input_tokens_seen": 5924672, "step": 5890 }, { "epoch": 2.7793493635077793, "grad_norm": 0.48063915967941284, "learning_rate": 4.976936001896823e-05, "loss": 0.1484, "num_input_tokens_seen": 5930496, "step": 5895 }, { "epoch": 2.7817067421027817, "grad_norm": 0.7630500197410583, "learning_rate": 4.976796394735254e-05, "loss": 0.2859, "num_input_tokens_seen": 5935808, "step": 5900 }, { "epoch": 2.784064120697784, "grad_norm": 1.3064689636230469, "learning_rate": 4.9766563682941094e-05, "loss": 0.2071, "num_input_tokens_seen": 5940352, "step": 5905 }, { "epoch": 2.7864214992927865, "grad_norm": 0.6247557997703552, "learning_rate": 4.976515922597094e-05, "loss": 0.3465, "num_input_tokens_seen": 5945184, "step": 5910 }, { "epoch": 2.788778877887789, "grad_norm": 1.3150755167007446, "learning_rate": 4.97637505766798e-05, "loss": 0.2187, "num_input_tokens_seen": 5950208, "step": 5915 }, { "epoch": 2.7911362564827913, "grad_norm": 0.8537529706954956, "learning_rate": 4.976233773530616e-05, "loss": 0.2417, "num_input_tokens_seen": 5956224, "step": 5920 }, { "epoch": 2.7934936350777937, "grad_norm": 1.7209465503692627, "learning_rate": 4.9760920702089186e-05, "loss": 0.2026, "num_input_tokens_seen": 5961408, "step": 5925 }, { "epoch": 2.795851013672796, "grad_norm": 0.30912864208221436, "learning_rate": 4.9759499477268745e-05, "loss": 0.1109, "num_input_tokens_seen": 5966144, "step": 5930 }, { "epoch": 2.798208392267798, "grad_norm": 2.08950138092041, "learning_rate": 4.9758074061085444e-05, "loss": 0.1546, "num_input_tokens_seen": 5971712, "step": 5935 }, { "epoch": 2.8005657708628005, "grad_norm": 0.7666255235671997, "learning_rate": 4.9756644453780565e-05, "loss": 0.2747, "num_input_tokens_seen": 5976736, "step": 5940 }, { "epoch": 2.802923149457803, "grad_norm": 1.0750664472579956, "learning_rate": 4.975521065559613e-05, "loss": 0.2302, "num_input_tokens_seen": 5982176, "step": 5945 }, { "epoch": 2.8052805280528053, "grad_norm": 1.095090389251709, "learning_rate": 4.9753772666774844e-05, "loss": 0.2952, "num_input_tokens_seen": 5987072, "step": 5950 }, { "epoch": 2.8076379066478077, "grad_norm": 2.717282772064209, "learning_rate": 4.9752330487560144e-05, "loss": 0.132, "num_input_tokens_seen": 5992736, "step": 5955 }, { "epoch": 2.80999528524281, "grad_norm": 0.4601496160030365, "learning_rate": 4.9750884118196165e-05, "loss": 0.1054, "num_input_tokens_seen": 5998464, "step": 5960 }, { "epoch": 2.8123526638378125, "grad_norm": 0.3220534324645996, "learning_rate": 4.974943355892775e-05, "loss": 0.0567, "num_input_tokens_seen": 6003584, "step": 5965 }, { "epoch": 2.814710042432815, "grad_norm": 0.6899735331535339, "learning_rate": 4.974797881000045e-05, "loss": 0.1338, "num_input_tokens_seen": 6008160, "step": 5970 }, { "epoch": 2.817067421027817, "grad_norm": 0.3868018090724945, "learning_rate": 4.974651987166054e-05, "loss": 0.1188, "num_input_tokens_seen": 6012864, "step": 5975 }, { "epoch": 2.8194247996228192, "grad_norm": 0.3911474943161011, "learning_rate": 4.974505674415498e-05, "loss": 0.0551, "num_input_tokens_seen": 6018176, "step": 5980 }, { "epoch": 2.8217821782178216, "grad_norm": 0.04804737865924835, "learning_rate": 4.974358942773147e-05, "loss": 0.2008, "num_input_tokens_seen": 6024192, "step": 5985 }, { "epoch": 2.824139556812824, "grad_norm": 0.30872565507888794, "learning_rate": 4.974211792263839e-05, "loss": 0.0746, "num_input_tokens_seen": 6028960, "step": 5990 }, { "epoch": 2.8264969354078264, "grad_norm": 0.6858574748039246, "learning_rate": 4.974064222912484e-05, "loss": 0.2703, "num_input_tokens_seen": 6034464, "step": 5995 }, { "epoch": 2.828854314002829, "grad_norm": 0.2871069610118866, "learning_rate": 4.973916234744064e-05, "loss": 0.1023, "num_input_tokens_seen": 6039296, "step": 6000 }, { "epoch": 2.8312116925978312, "grad_norm": 1.9601812362670898, "learning_rate": 4.973767827783629e-05, "loss": 0.1123, "num_input_tokens_seen": 6043744, "step": 6005 }, { "epoch": 2.8335690711928336, "grad_norm": 0.42491576075553894, "learning_rate": 4.973619002056303e-05, "loss": 0.021, "num_input_tokens_seen": 6048384, "step": 6010 }, { "epoch": 2.835926449787836, "grad_norm": 1.2674460411071777, "learning_rate": 4.9734697575872813e-05, "loss": 0.1344, "num_input_tokens_seen": 6053824, "step": 6015 }, { "epoch": 2.8382838283828384, "grad_norm": 9.323615074157715, "learning_rate": 4.9733200944018254e-05, "loss": 0.2385, "num_input_tokens_seen": 6058592, "step": 6020 }, { "epoch": 2.840641206977841, "grad_norm": 0.26452964544296265, "learning_rate": 4.9731700125252724e-05, "loss": 0.1833, "num_input_tokens_seen": 6063680, "step": 6025 }, { "epoch": 2.8429985855728432, "grad_norm": 2.128523111343384, "learning_rate": 4.973019511983028e-05, "loss": 0.3052, "num_input_tokens_seen": 6069056, "step": 6030 }, { "epoch": 2.845355964167845, "grad_norm": 0.408285915851593, "learning_rate": 4.97286859280057e-05, "loss": 0.0816, "num_input_tokens_seen": 6075168, "step": 6035 }, { "epoch": 2.8477133427628476, "grad_norm": 1.2070019245147705, "learning_rate": 4.972717255003446e-05, "loss": 0.1032, "num_input_tokens_seen": 6081728, "step": 6040 }, { "epoch": 2.85007072135785, "grad_norm": 0.7196281552314758, "learning_rate": 4.972565498617276e-05, "loss": 0.1496, "num_input_tokens_seen": 6086592, "step": 6045 }, { "epoch": 2.8524280999528524, "grad_norm": 0.23898158967494965, "learning_rate": 4.9724133236677476e-05, "loss": 0.3393, "num_input_tokens_seen": 6091008, "step": 6050 }, { "epoch": 2.854785478547855, "grad_norm": 0.9336380362510681, "learning_rate": 4.9722607301806224e-05, "loss": 0.2521, "num_input_tokens_seen": 6096928, "step": 6055 }, { "epoch": 2.857142857142857, "grad_norm": 0.48854854702949524, "learning_rate": 4.9721077181817334e-05, "loss": 0.1734, "num_input_tokens_seen": 6101888, "step": 6060 }, { "epoch": 2.8595002357378596, "grad_norm": 0.5615476369857788, "learning_rate": 4.971954287696981e-05, "loss": 0.0548, "num_input_tokens_seen": 6107680, "step": 6065 }, { "epoch": 2.861857614332862, "grad_norm": 0.7800881862640381, "learning_rate": 4.971800438752339e-05, "loss": 0.1269, "num_input_tokens_seen": 6112608, "step": 6070 }, { "epoch": 2.864214992927864, "grad_norm": 0.8081766963005066, "learning_rate": 4.971646171373852e-05, "loss": 0.0706, "num_input_tokens_seen": 6117536, "step": 6075 }, { "epoch": 2.8665723715228664, "grad_norm": 0.933901846408844, "learning_rate": 4.9714914855876336e-05, "loss": 0.2016, "num_input_tokens_seen": 6122400, "step": 6080 }, { "epoch": 2.8689297501178688, "grad_norm": 1.057425856590271, "learning_rate": 4.971336381419871e-05, "loss": 0.1609, "num_input_tokens_seen": 6126976, "step": 6085 }, { "epoch": 2.871287128712871, "grad_norm": 1.1764169931411743, "learning_rate": 4.97118085889682e-05, "loss": 0.1109, "num_input_tokens_seen": 6132704, "step": 6090 }, { "epoch": 2.8736445073078736, "grad_norm": 0.028203202411532402, "learning_rate": 4.9710249180448075e-05, "loss": 0.159, "num_input_tokens_seen": 6137024, "step": 6095 }, { "epoch": 2.876001885902876, "grad_norm": 0.23797598481178284, "learning_rate": 4.970868558890232e-05, "loss": 0.1284, "num_input_tokens_seen": 6142784, "step": 6100 }, { "epoch": 2.8783592644978784, "grad_norm": 0.6478725075721741, "learning_rate": 4.970711781459563e-05, "loss": 0.2579, "num_input_tokens_seen": 6148672, "step": 6105 }, { "epoch": 2.880716643092881, "grad_norm": 1.0355982780456543, "learning_rate": 4.9705545857793386e-05, "loss": 0.1125, "num_input_tokens_seen": 6154208, "step": 6110 }, { "epoch": 2.883074021687883, "grad_norm": 0.5159767270088196, "learning_rate": 4.970396971876171e-05, "loss": 0.2309, "num_input_tokens_seen": 6159232, "step": 6115 }, { "epoch": 2.8854314002828856, "grad_norm": 1.0773297548294067, "learning_rate": 4.9702389397767414e-05, "loss": 0.0857, "num_input_tokens_seen": 6164768, "step": 6120 }, { "epoch": 2.887788778877888, "grad_norm": 2.312067985534668, "learning_rate": 4.970080489507801e-05, "loss": 0.4121, "num_input_tokens_seen": 6169248, "step": 6125 }, { "epoch": 2.8901461574728904, "grad_norm": 0.31164056062698364, "learning_rate": 4.969921621096174e-05, "loss": 0.0831, "num_input_tokens_seen": 6174016, "step": 6130 }, { "epoch": 2.8925035360678923, "grad_norm": 3.8971292972564697, "learning_rate": 4.969762334568753e-05, "loss": 0.229, "num_input_tokens_seen": 6179616, "step": 6135 }, { "epoch": 2.8948609146628947, "grad_norm": 0.3805057406425476, "learning_rate": 4.9696026299525024e-05, "loss": 0.1896, "num_input_tokens_seen": 6184352, "step": 6140 }, { "epoch": 2.897218293257897, "grad_norm": 1.5455052852630615, "learning_rate": 4.969442507274459e-05, "loss": 0.2158, "num_input_tokens_seen": 6188608, "step": 6145 }, { "epoch": 2.8995756718528995, "grad_norm": 0.9357823729515076, "learning_rate": 4.9692819665617265e-05, "loss": 0.2206, "num_input_tokens_seen": 6193440, "step": 6150 }, { "epoch": 2.901933050447902, "grad_norm": 1.1831523180007935, "learning_rate": 4.969121007841484e-05, "loss": 0.0911, "num_input_tokens_seen": 6199296, "step": 6155 }, { "epoch": 2.9042904290429044, "grad_norm": 0.2551688253879547, "learning_rate": 4.968959631140978e-05, "loss": 0.048, "num_input_tokens_seen": 6204800, "step": 6160 }, { "epoch": 2.9066478076379068, "grad_norm": 0.1623733788728714, "learning_rate": 4.9687978364875274e-05, "loss": 0.1502, "num_input_tokens_seen": 6210144, "step": 6165 }, { "epoch": 2.909005186232909, "grad_norm": 1.4655237197875977, "learning_rate": 4.9686356239085206e-05, "loss": 0.1546, "num_input_tokens_seen": 6215040, "step": 6170 }, { "epoch": 2.911362564827911, "grad_norm": 1.5319360494613647, "learning_rate": 4.968472993431418e-05, "loss": 0.2391, "num_input_tokens_seen": 6219488, "step": 6175 }, { "epoch": 2.9137199434229135, "grad_norm": 0.3433821201324463, "learning_rate": 4.96830994508375e-05, "loss": 0.1847, "num_input_tokens_seen": 6224320, "step": 6180 }, { "epoch": 2.916077322017916, "grad_norm": 0.6286081075668335, "learning_rate": 4.9681464788931166e-05, "loss": 0.0577, "num_input_tokens_seen": 6230272, "step": 6185 }, { "epoch": 2.9184347006129183, "grad_norm": 0.3188633322715759, "learning_rate": 4.967982594887192e-05, "loss": 0.2073, "num_input_tokens_seen": 6234784, "step": 6190 }, { "epoch": 2.9207920792079207, "grad_norm": 1.0685255527496338, "learning_rate": 4.967818293093718e-05, "loss": 0.188, "num_input_tokens_seen": 6239904, "step": 6195 }, { "epoch": 2.923149457802923, "grad_norm": 1.4694334268569946, "learning_rate": 4.9676535735405084e-05, "loss": 0.2653, "num_input_tokens_seen": 6245344, "step": 6200 }, { "epoch": 2.9255068363979255, "grad_norm": 0.6601101160049438, "learning_rate": 4.9674884362554466e-05, "loss": 0.1254, "num_input_tokens_seen": 6249440, "step": 6205 }, { "epoch": 2.927864214992928, "grad_norm": 0.9931111931800842, "learning_rate": 4.9673228812664884e-05, "loss": 0.2433, "num_input_tokens_seen": 6254432, "step": 6210 }, { "epoch": 2.9302215935879303, "grad_norm": 0.3793575167655945, "learning_rate": 4.967156908601659e-05, "loss": 0.0563, "num_input_tokens_seen": 6259520, "step": 6215 }, { "epoch": 2.9325789721829327, "grad_norm": 0.195034459233284, "learning_rate": 4.966990518289055e-05, "loss": 0.1161, "num_input_tokens_seen": 6264448, "step": 6220 }, { "epoch": 2.934936350777935, "grad_norm": 0.20141032338142395, "learning_rate": 4.966823710356844e-05, "loss": 0.0765, "num_input_tokens_seen": 6269664, "step": 6225 }, { "epoch": 2.9372937293729375, "grad_norm": 0.3440210521221161, "learning_rate": 4.966656484833262e-05, "loss": 0.1116, "num_input_tokens_seen": 6274400, "step": 6230 }, { "epoch": 2.9396511079679395, "grad_norm": 0.8022878766059875, "learning_rate": 4.966488841746619e-05, "loss": 0.3109, "num_input_tokens_seen": 6279296, "step": 6235 }, { "epoch": 2.942008486562942, "grad_norm": 1.1034272909164429, "learning_rate": 4.9663207811252936e-05, "loss": 0.2476, "num_input_tokens_seen": 6284224, "step": 6240 }, { "epoch": 2.9443658651579443, "grad_norm": 1.214102864265442, "learning_rate": 4.9661523029977365e-05, "loss": 0.0857, "num_input_tokens_seen": 6288512, "step": 6245 }, { "epoch": 2.9467232437529467, "grad_norm": 0.9149835705757141, "learning_rate": 4.965983407392466e-05, "loss": 0.1525, "num_input_tokens_seen": 6294688, "step": 6250 }, { "epoch": 2.949080622347949, "grad_norm": 0.3460152745246887, "learning_rate": 4.965814094338076e-05, "loss": 0.0602, "num_input_tokens_seen": 6299392, "step": 6255 }, { "epoch": 2.9514380009429515, "grad_norm": 0.9317647814750671, "learning_rate": 4.965644363863226e-05, "loss": 0.1794, "num_input_tokens_seen": 6304160, "step": 6260 }, { "epoch": 2.953795379537954, "grad_norm": 0.5089988112449646, "learning_rate": 4.96547421599665e-05, "loss": 0.1713, "num_input_tokens_seen": 6311136, "step": 6265 }, { "epoch": 2.9561527581329563, "grad_norm": 0.3070124685764313, "learning_rate": 4.96530365076715e-05, "loss": 0.1328, "num_input_tokens_seen": 6315264, "step": 6270 }, { "epoch": 2.9585101367279583, "grad_norm": 0.05697769299149513, "learning_rate": 4.965132668203601e-05, "loss": 0.2936, "num_input_tokens_seen": 6320192, "step": 6275 }, { "epoch": 2.9608675153229607, "grad_norm": 0.09325933456420898, "learning_rate": 4.964961268334947e-05, "loss": 0.1535, "num_input_tokens_seen": 6324864, "step": 6280 }, { "epoch": 2.963224893917963, "grad_norm": 0.7439942955970764, "learning_rate": 4.9647894511902024e-05, "loss": 0.181, "num_input_tokens_seen": 6329536, "step": 6285 }, { "epoch": 2.9655822725129655, "grad_norm": 1.5181961059570312, "learning_rate": 4.964617216798454e-05, "loss": 0.1445, "num_input_tokens_seen": 6335424, "step": 6290 }, { "epoch": 2.967939651107968, "grad_norm": 0.5875119566917419, "learning_rate": 4.964444565188857e-05, "loss": 0.0892, "num_input_tokens_seen": 6340160, "step": 6295 }, { "epoch": 2.9702970297029703, "grad_norm": 2.6316261291503906, "learning_rate": 4.964271496390639e-05, "loss": 0.2841, "num_input_tokens_seen": 6344288, "step": 6300 }, { "epoch": 2.9726544082979727, "grad_norm": 2.0354831218719482, "learning_rate": 4.964098010433098e-05, "loss": 0.1226, "num_input_tokens_seen": 6349088, "step": 6305 }, { "epoch": 2.975011786892975, "grad_norm": 0.2115744650363922, "learning_rate": 4.963924107345602e-05, "loss": 0.0651, "num_input_tokens_seen": 6353248, "step": 6310 }, { "epoch": 2.9773691654879775, "grad_norm": 0.8128047585487366, "learning_rate": 4.963749787157589e-05, "loss": 0.0921, "num_input_tokens_seen": 6359200, "step": 6315 }, { "epoch": 2.97972654408298, "grad_norm": 0.0252896249294281, "learning_rate": 4.96357504989857e-05, "loss": 0.0909, "num_input_tokens_seen": 6364448, "step": 6320 }, { "epoch": 2.9820839226779823, "grad_norm": 2.1694464683532715, "learning_rate": 4.963399895598123e-05, "loss": 0.2525, "num_input_tokens_seen": 6368928, "step": 6325 }, { "epoch": 2.9844413012729847, "grad_norm": 1.6588672399520874, "learning_rate": 4.963224324285901e-05, "loss": 0.2307, "num_input_tokens_seen": 6374016, "step": 6330 }, { "epoch": 2.9867986798679866, "grad_norm": 0.6983335614204407, "learning_rate": 4.963048335991624e-05, "loss": 0.2064, "num_input_tokens_seen": 6378816, "step": 6335 }, { "epoch": 2.989156058462989, "grad_norm": 0.2243284434080124, "learning_rate": 4.9628719307450835e-05, "loss": 0.1297, "num_input_tokens_seen": 6384416, "step": 6340 }, { "epoch": 2.9915134370579914, "grad_norm": 0.765703558921814, "learning_rate": 4.962695108576142e-05, "loss": 0.2136, "num_input_tokens_seen": 6388768, "step": 6345 }, { "epoch": 2.993870815652994, "grad_norm": 0.912735104560852, "learning_rate": 4.962517869514733e-05, "loss": 0.1228, "num_input_tokens_seen": 6393312, "step": 6350 }, { "epoch": 2.9962281942479962, "grad_norm": 1.2674709558486938, "learning_rate": 4.96234021359086e-05, "loss": 0.1204, "num_input_tokens_seen": 6397728, "step": 6355 }, { "epoch": 2.9985855728429986, "grad_norm": 1.0558853149414062, "learning_rate": 4.962162140834596e-05, "loss": 0.1143, "num_input_tokens_seen": 6402080, "step": 6360 }, { "epoch": 3.0, "eval_loss": 0.17026710510253906, "eval_runtime": 15.1664, "eval_samples_per_second": 62.177, "eval_steps_per_second": 15.561, "num_input_tokens_seen": 6404896, "step": 6363 }, { "epoch": 3.000942951438001, "grad_norm": 1.6644254922866821, "learning_rate": 4.961983651276089e-05, "loss": 0.152, "num_input_tokens_seen": 6407104, "step": 6365 }, { "epoch": 3.0033003300330035, "grad_norm": 0.9208534955978394, "learning_rate": 4.9618047449455496e-05, "loss": 0.4126, "num_input_tokens_seen": 6411808, "step": 6370 }, { "epoch": 3.005657708628006, "grad_norm": 1.489980936050415, "learning_rate": 4.9616254218732674e-05, "loss": 0.1451, "num_input_tokens_seen": 6418912, "step": 6375 }, { "epoch": 3.008015087223008, "grad_norm": 1.8186463117599487, "learning_rate": 4.9614456820895974e-05, "loss": 0.2506, "num_input_tokens_seen": 6424864, "step": 6380 }, { "epoch": 3.01037246581801, "grad_norm": 0.6382980942726135, "learning_rate": 4.961265525624965e-05, "loss": 0.1241, "num_input_tokens_seen": 6430496, "step": 6385 }, { "epoch": 3.0127298444130126, "grad_norm": 0.39655762910842896, "learning_rate": 4.96108495250987e-05, "loss": 0.2899, "num_input_tokens_seen": 6436160, "step": 6390 }, { "epoch": 3.015087223008015, "grad_norm": 1.0776886940002441, "learning_rate": 4.9609039627748794e-05, "loss": 0.1618, "num_input_tokens_seen": 6444320, "step": 6395 }, { "epoch": 3.0174446016030174, "grad_norm": 0.2797795236110687, "learning_rate": 4.960722556450631e-05, "loss": 0.1042, "num_input_tokens_seen": 6449120, "step": 6400 }, { "epoch": 3.01980198019802, "grad_norm": 0.8073962926864624, "learning_rate": 4.9605407335678354e-05, "loss": 0.1729, "num_input_tokens_seen": 6453248, "step": 6405 }, { "epoch": 3.022159358793022, "grad_norm": 0.30400151014328003, "learning_rate": 4.9603584941572704e-05, "loss": 0.0778, "num_input_tokens_seen": 6457664, "step": 6410 }, { "epoch": 3.0245167373880246, "grad_norm": 1.4982370138168335, "learning_rate": 4.9601758382497875e-05, "loss": 0.0968, "num_input_tokens_seen": 6463520, "step": 6415 }, { "epoch": 3.026874115983027, "grad_norm": 2.068755626678467, "learning_rate": 4.959992765876306e-05, "loss": 0.4115, "num_input_tokens_seen": 6468672, "step": 6420 }, { "epoch": 3.0292314945780294, "grad_norm": 0.31414398550987244, "learning_rate": 4.9598092770678186e-05, "loss": 0.1014, "num_input_tokens_seen": 6474048, "step": 6425 }, { "epoch": 3.0315888731730314, "grad_norm": 1.071690320968628, "learning_rate": 4.959625371855384e-05, "loss": 0.1433, "num_input_tokens_seen": 6479936, "step": 6430 }, { "epoch": 3.033946251768034, "grad_norm": 0.9132258296012878, "learning_rate": 4.959441050270138e-05, "loss": 0.0506, "num_input_tokens_seen": 6486528, "step": 6435 }, { "epoch": 3.036303630363036, "grad_norm": 0.7504556179046631, "learning_rate": 4.95925631234328e-05, "loss": 0.0895, "num_input_tokens_seen": 6492928, "step": 6440 }, { "epoch": 3.0386610089580386, "grad_norm": 1.7332508563995361, "learning_rate": 4.9590711581060844e-05, "loss": 0.0746, "num_input_tokens_seen": 6496992, "step": 6445 }, { "epoch": 3.041018387553041, "grad_norm": 1.2376347780227661, "learning_rate": 4.958885587589894e-05, "loss": 0.332, "num_input_tokens_seen": 6502368, "step": 6450 }, { "epoch": 3.0433757661480434, "grad_norm": 2.5828685760498047, "learning_rate": 4.9586996008261235e-05, "loss": 0.4023, "num_input_tokens_seen": 6507840, "step": 6455 }, { "epoch": 3.045733144743046, "grad_norm": 0.9020931124687195, "learning_rate": 4.958513197846257e-05, "loss": 0.2802, "num_input_tokens_seen": 6512832, "step": 6460 }, { "epoch": 3.048090523338048, "grad_norm": 1.242229700088501, "learning_rate": 4.958326378681849e-05, "loss": 0.2499, "num_input_tokens_seen": 6518400, "step": 6465 }, { "epoch": 3.0504479019330506, "grad_norm": 1.4036555290222168, "learning_rate": 4.9581391433645254e-05, "loss": 0.3188, "num_input_tokens_seen": 6523200, "step": 6470 }, { "epoch": 3.052805280528053, "grad_norm": 1.4532557725906372, "learning_rate": 4.9579514919259806e-05, "loss": 0.1879, "num_input_tokens_seen": 6527904, "step": 6475 }, { "epoch": 3.055162659123055, "grad_norm": 0.050382956862449646, "learning_rate": 4.957763424397983e-05, "loss": 0.0725, "num_input_tokens_seen": 6532192, "step": 6480 }, { "epoch": 3.0575200377180574, "grad_norm": 0.2961033284664154, "learning_rate": 4.957574940812368e-05, "loss": 0.3633, "num_input_tokens_seen": 6537216, "step": 6485 }, { "epoch": 3.0598774163130598, "grad_norm": 0.20893120765686035, "learning_rate": 4.9573860412010426e-05, "loss": 0.0302, "num_input_tokens_seen": 6541664, "step": 6490 }, { "epoch": 3.062234794908062, "grad_norm": 0.16807091236114502, "learning_rate": 4.957196725595985e-05, "loss": 0.1029, "num_input_tokens_seen": 6547008, "step": 6495 }, { "epoch": 3.0645921735030646, "grad_norm": 0.31169891357421875, "learning_rate": 4.957006994029241e-05, "loss": 0.2329, "num_input_tokens_seen": 6551552, "step": 6500 }, { "epoch": 3.066949552098067, "grad_norm": 0.10149665921926498, "learning_rate": 4.9568168465329317e-05, "loss": 0.4256, "num_input_tokens_seen": 6556640, "step": 6505 }, { "epoch": 3.0693069306930694, "grad_norm": 3.3383772373199463, "learning_rate": 4.9566262831392446e-05, "loss": 0.232, "num_input_tokens_seen": 6561152, "step": 6510 }, { "epoch": 3.0716643092880718, "grad_norm": 1.2268272638320923, "learning_rate": 4.956435303880439e-05, "loss": 0.2633, "num_input_tokens_seen": 6566560, "step": 6515 }, { "epoch": 3.074021687883074, "grad_norm": 1.6338013410568237, "learning_rate": 4.956243908788844e-05, "loss": 0.1973, "num_input_tokens_seen": 6571680, "step": 6520 }, { "epoch": 3.0763790664780766, "grad_norm": 0.45250725746154785, "learning_rate": 4.95605209789686e-05, "loss": 0.0817, "num_input_tokens_seen": 6576032, "step": 6525 }, { "epoch": 3.0787364450730785, "grad_norm": 0.6546145677566528, "learning_rate": 4.9558598712369574e-05, "loss": 0.2144, "num_input_tokens_seen": 6581280, "step": 6530 }, { "epoch": 3.081093823668081, "grad_norm": 1.4691271781921387, "learning_rate": 4.955667228841676e-05, "loss": 0.1426, "num_input_tokens_seen": 6585696, "step": 6535 }, { "epoch": 3.0834512022630833, "grad_norm": 0.3814021646976471, "learning_rate": 4.955474170743629e-05, "loss": 0.1965, "num_input_tokens_seen": 6591104, "step": 6540 }, { "epoch": 3.0858085808580857, "grad_norm": 0.7699352502822876, "learning_rate": 4.955280696975495e-05, "loss": 0.1529, "num_input_tokens_seen": 6596448, "step": 6545 }, { "epoch": 3.088165959453088, "grad_norm": 1.1858035326004028, "learning_rate": 4.955086807570028e-05, "loss": 0.093, "num_input_tokens_seen": 6600256, "step": 6550 }, { "epoch": 3.0905233380480905, "grad_norm": 0.7870771288871765, "learning_rate": 4.95489250256005e-05, "loss": 0.2642, "num_input_tokens_seen": 6605792, "step": 6555 }, { "epoch": 3.092880716643093, "grad_norm": 0.8325827121734619, "learning_rate": 4.954697781978452e-05, "loss": 0.1107, "num_input_tokens_seen": 6610048, "step": 6560 }, { "epoch": 3.0952380952380953, "grad_norm": 0.3221447169780731, "learning_rate": 4.954502645858199e-05, "loss": 0.0651, "num_input_tokens_seen": 6614848, "step": 6565 }, { "epoch": 3.0975954738330977, "grad_norm": 0.4705396890640259, "learning_rate": 4.954307094232322e-05, "loss": 0.0848, "num_input_tokens_seen": 6620256, "step": 6570 }, { "epoch": 3.0999528524281, "grad_norm": 0.3881877064704895, "learning_rate": 4.954111127133926e-05, "loss": 0.1435, "num_input_tokens_seen": 6625440, "step": 6575 }, { "epoch": 3.102310231023102, "grad_norm": 0.5355534553527832, "learning_rate": 4.9539147445961854e-05, "loss": 0.0955, "num_input_tokens_seen": 6631200, "step": 6580 }, { "epoch": 3.1046676096181045, "grad_norm": 0.36157095432281494, "learning_rate": 4.9537179466523426e-05, "loss": 0.1351, "num_input_tokens_seen": 6636160, "step": 6585 }, { "epoch": 3.107024988213107, "grad_norm": 1.109520435333252, "learning_rate": 4.9535207333357134e-05, "loss": 0.2229, "num_input_tokens_seen": 6641792, "step": 6590 }, { "epoch": 3.1093823668081093, "grad_norm": 1.3504745960235596, "learning_rate": 4.953323104679683e-05, "loss": 0.1007, "num_input_tokens_seen": 6646656, "step": 6595 }, { "epoch": 3.1117397454031117, "grad_norm": 2.771028518676758, "learning_rate": 4.953125060717706e-05, "loss": 0.2019, "num_input_tokens_seen": 6651168, "step": 6600 }, { "epoch": 3.114097123998114, "grad_norm": 0.05644705891609192, "learning_rate": 4.952926601483308e-05, "loss": 0.0616, "num_input_tokens_seen": 6655744, "step": 6605 }, { "epoch": 3.1164545025931165, "grad_norm": 0.09213351458311081, "learning_rate": 4.9527277270100836e-05, "loss": 0.0837, "num_input_tokens_seen": 6661536, "step": 6610 }, { "epoch": 3.118811881188119, "grad_norm": 1.2799209356307983, "learning_rate": 4.952528437331701e-05, "loss": 0.1264, "num_input_tokens_seen": 6665728, "step": 6615 }, { "epoch": 3.1211692597831213, "grad_norm": 0.31689032912254333, "learning_rate": 4.952328732481895e-05, "loss": 0.1449, "num_input_tokens_seen": 6670688, "step": 6620 }, { "epoch": 3.1235266383781237, "grad_norm": 0.8524962067604065, "learning_rate": 4.952128612494473e-05, "loss": 0.2267, "num_input_tokens_seen": 6674816, "step": 6625 }, { "epoch": 3.1258840169731257, "grad_norm": 0.5077894926071167, "learning_rate": 4.951928077403312e-05, "loss": 0.2343, "num_input_tokens_seen": 6680192, "step": 6630 }, { "epoch": 3.128241395568128, "grad_norm": 0.09727680683135986, "learning_rate": 4.9517271272423585e-05, "loss": 0.0268, "num_input_tokens_seen": 6685120, "step": 6635 }, { "epoch": 3.1305987741631305, "grad_norm": 1.8549468517303467, "learning_rate": 4.95152576204563e-05, "loss": 0.1816, "num_input_tokens_seen": 6690528, "step": 6640 }, { "epoch": 3.132956152758133, "grad_norm": 0.37919971346855164, "learning_rate": 4.9513239818472154e-05, "loss": 0.0703, "num_input_tokens_seen": 6696352, "step": 6645 }, { "epoch": 3.1353135313531353, "grad_norm": 0.38301563262939453, "learning_rate": 4.9511217866812706e-05, "loss": 0.1116, "num_input_tokens_seen": 6701280, "step": 6650 }, { "epoch": 3.1376709099481377, "grad_norm": 0.29616427421569824, "learning_rate": 4.950919176582026e-05, "loss": 0.1041, "num_input_tokens_seen": 6706144, "step": 6655 }, { "epoch": 3.14002828854314, "grad_norm": 0.10659390687942505, "learning_rate": 4.9507161515837785e-05, "loss": 0.1412, "num_input_tokens_seen": 6711936, "step": 6660 }, { "epoch": 3.1423856671381425, "grad_norm": 0.5828543901443481, "learning_rate": 4.950512711720898e-05, "loss": 0.3026, "num_input_tokens_seen": 6716384, "step": 6665 }, { "epoch": 3.144743045733145, "grad_norm": 2.781531572341919, "learning_rate": 4.9503088570278224e-05, "loss": 0.2784, "num_input_tokens_seen": 6721664, "step": 6670 }, { "epoch": 3.1471004243281473, "grad_norm": 1.3625754117965698, "learning_rate": 4.95010458753906e-05, "loss": 0.0888, "num_input_tokens_seen": 6726880, "step": 6675 }, { "epoch": 3.1494578029231493, "grad_norm": 0.15463541448116302, "learning_rate": 4.949899903289193e-05, "loss": 0.0849, "num_input_tokens_seen": 6732544, "step": 6680 }, { "epoch": 3.1518151815181517, "grad_norm": 3.127394676208496, "learning_rate": 4.949694804312869e-05, "loss": 0.1592, "num_input_tokens_seen": 6740288, "step": 6685 }, { "epoch": 3.154172560113154, "grad_norm": 0.35479068756103516, "learning_rate": 4.949489290644808e-05, "loss": 0.1171, "num_input_tokens_seen": 6747040, "step": 6690 }, { "epoch": 3.1565299387081565, "grad_norm": 1.9038019180297852, "learning_rate": 4.949283362319799e-05, "loss": 0.338, "num_input_tokens_seen": 6752096, "step": 6695 }, { "epoch": 3.158887317303159, "grad_norm": 0.687354564666748, "learning_rate": 4.949077019372704e-05, "loss": 0.0894, "num_input_tokens_seen": 6757184, "step": 6700 }, { "epoch": 3.1612446958981613, "grad_norm": 0.5491605401039124, "learning_rate": 4.948870261838453e-05, "loss": 0.287, "num_input_tokens_seen": 6762688, "step": 6705 }, { "epoch": 3.1636020744931637, "grad_norm": 0.4163500964641571, "learning_rate": 4.948663089752046e-05, "loss": 0.1628, "num_input_tokens_seen": 6766720, "step": 6710 }, { "epoch": 3.165959453088166, "grad_norm": 0.4289177358150482, "learning_rate": 4.948455503148554e-05, "loss": 0.1321, "num_input_tokens_seen": 6771200, "step": 6715 }, { "epoch": 3.1683168316831685, "grad_norm": 0.027863144874572754, "learning_rate": 4.948247502063117e-05, "loss": 0.1946, "num_input_tokens_seen": 6776096, "step": 6720 }, { "epoch": 3.170674210278171, "grad_norm": 0.4896935522556305, "learning_rate": 4.9480390865309466e-05, "loss": 0.1014, "num_input_tokens_seen": 6780192, "step": 6725 }, { "epoch": 3.173031588873173, "grad_norm": 0.06011781841516495, "learning_rate": 4.9478302565873245e-05, "loss": 0.1148, "num_input_tokens_seen": 6784896, "step": 6730 }, { "epoch": 3.1753889674681752, "grad_norm": 1.7688415050506592, "learning_rate": 4.9476210122676025e-05, "loss": 0.26, "num_input_tokens_seen": 6789568, "step": 6735 }, { "epoch": 3.1777463460631776, "grad_norm": 1.61400306224823, "learning_rate": 4.947411353607201e-05, "loss": 0.2349, "num_input_tokens_seen": 6793664, "step": 6740 }, { "epoch": 3.18010372465818, "grad_norm": 0.33232900500297546, "learning_rate": 4.9472012806416114e-05, "loss": 0.2738, "num_input_tokens_seen": 6798816, "step": 6745 }, { "epoch": 3.1824611032531824, "grad_norm": 0.126975879073143, "learning_rate": 4.946990793406396e-05, "loss": 0.14, "num_input_tokens_seen": 6804416, "step": 6750 }, { "epoch": 3.184818481848185, "grad_norm": 0.6540811657905579, "learning_rate": 4.9467798919371874e-05, "loss": 0.212, "num_input_tokens_seen": 6809568, "step": 6755 }, { "epoch": 3.1871758604431872, "grad_norm": 0.563190221786499, "learning_rate": 4.9465685762696874e-05, "loss": 0.2004, "num_input_tokens_seen": 6814912, "step": 6760 }, { "epoch": 3.1895332390381896, "grad_norm": 0.38003042340278625, "learning_rate": 4.946356846439667e-05, "loss": 0.1068, "num_input_tokens_seen": 6819584, "step": 6765 }, { "epoch": 3.191890617633192, "grad_norm": 0.6696445941925049, "learning_rate": 4.9461447024829696e-05, "loss": 0.1054, "num_input_tokens_seen": 6825536, "step": 6770 }, { "epoch": 3.1942479962281944, "grad_norm": 1.214248776435852, "learning_rate": 4.9459321444355077e-05, "loss": 0.2415, "num_input_tokens_seen": 6831264, "step": 6775 }, { "epoch": 3.1966053748231964, "grad_norm": 0.9951841235160828, "learning_rate": 4.945719172333263e-05, "loss": 0.342, "num_input_tokens_seen": 6835648, "step": 6780 }, { "epoch": 3.198962753418199, "grad_norm": 0.5890200734138489, "learning_rate": 4.945505786212288e-05, "loss": 0.0627, "num_input_tokens_seen": 6839936, "step": 6785 }, { "epoch": 3.201320132013201, "grad_norm": 0.3264090418815613, "learning_rate": 4.945291986108707e-05, "loss": 0.27, "num_input_tokens_seen": 6844736, "step": 6790 }, { "epoch": 3.2036775106082036, "grad_norm": 0.9329696297645569, "learning_rate": 4.945077772058709e-05, "loss": 0.3456, "num_input_tokens_seen": 6849536, "step": 6795 }, { "epoch": 3.206034889203206, "grad_norm": 1.77126944065094, "learning_rate": 4.944863144098562e-05, "loss": 0.2046, "num_input_tokens_seen": 6853728, "step": 6800 }, { "epoch": 3.2083922677982084, "grad_norm": 1.4786972999572754, "learning_rate": 4.9446481022645944e-05, "loss": 0.1131, "num_input_tokens_seen": 6859264, "step": 6805 }, { "epoch": 3.210749646393211, "grad_norm": 0.883396327495575, "learning_rate": 4.944432646593211e-05, "loss": 0.0635, "num_input_tokens_seen": 6864352, "step": 6810 }, { "epoch": 3.213107024988213, "grad_norm": 0.22872045636177063, "learning_rate": 4.944216777120885e-05, "loss": 0.0785, "num_input_tokens_seen": 6868576, "step": 6815 }, { "epoch": 3.2154644035832156, "grad_norm": 0.49498990178108215, "learning_rate": 4.9440004938841587e-05, "loss": 0.2133, "num_input_tokens_seen": 6874080, "step": 6820 }, { "epoch": 3.217821782178218, "grad_norm": 0.33165276050567627, "learning_rate": 4.943783796919646e-05, "loss": 0.2976, "num_input_tokens_seen": 6879680, "step": 6825 }, { "epoch": 3.22017916077322, "grad_norm": 0.3651893734931946, "learning_rate": 4.9435666862640294e-05, "loss": 0.1254, "num_input_tokens_seen": 6885760, "step": 6830 }, { "epoch": 3.2225365393682224, "grad_norm": 0.536078929901123, "learning_rate": 4.943349161954062e-05, "loss": 0.2815, "num_input_tokens_seen": 6890784, "step": 6835 }, { "epoch": 3.2248939179632248, "grad_norm": 0.08269314467906952, "learning_rate": 4.9431312240265674e-05, "loss": 0.0555, "num_input_tokens_seen": 6896352, "step": 6840 }, { "epoch": 3.227251296558227, "grad_norm": 0.2190236747264862, "learning_rate": 4.9429128725184385e-05, "loss": 0.2245, "num_input_tokens_seen": 6901216, "step": 6845 }, { "epoch": 3.2296086751532296, "grad_norm": 1.7088310718536377, "learning_rate": 4.9426941074666386e-05, "loss": 0.23, "num_input_tokens_seen": 6906016, "step": 6850 }, { "epoch": 3.231966053748232, "grad_norm": 1.7267204523086548, "learning_rate": 4.942474928908201e-05, "loss": 0.2107, "num_input_tokens_seen": 6911040, "step": 6855 }, { "epoch": 3.2343234323432344, "grad_norm": 0.9023023843765259, "learning_rate": 4.9422553368802294e-05, "loss": 0.1973, "num_input_tokens_seen": 6916160, "step": 6860 }, { "epoch": 3.236680810938237, "grad_norm": 1.0777636766433716, "learning_rate": 4.9420353314198954e-05, "loss": 0.2826, "num_input_tokens_seen": 6920384, "step": 6865 }, { "epoch": 3.239038189533239, "grad_norm": 0.12449587136507034, "learning_rate": 4.941814912564444e-05, "loss": 0.1876, "num_input_tokens_seen": 6924960, "step": 6870 }, { "epoch": 3.2413955681282416, "grad_norm": 0.8197303414344788, "learning_rate": 4.9415940803511864e-05, "loss": 0.0812, "num_input_tokens_seen": 6929280, "step": 6875 }, { "epoch": 3.2437529467232435, "grad_norm": 0.7019719481468201, "learning_rate": 4.941372834817508e-05, "loss": 0.1457, "num_input_tokens_seen": 6935488, "step": 6880 }, { "epoch": 3.246110325318246, "grad_norm": 0.027062561362981796, "learning_rate": 4.94115117600086e-05, "loss": 0.1172, "num_input_tokens_seen": 6939616, "step": 6885 }, { "epoch": 3.2484677039132484, "grad_norm": 0.5643945932388306, "learning_rate": 4.940929103938767e-05, "loss": 0.0793, "num_input_tokens_seen": 6946464, "step": 6890 }, { "epoch": 3.2508250825082508, "grad_norm": 2.631277322769165, "learning_rate": 4.940706618668821e-05, "loss": 0.1558, "num_input_tokens_seen": 6951104, "step": 6895 }, { "epoch": 3.253182461103253, "grad_norm": 1.7241142988204956, "learning_rate": 4.940483720228685e-05, "loss": 0.1548, "num_input_tokens_seen": 6957088, "step": 6900 }, { "epoch": 3.2555398396982556, "grad_norm": 2.36224102973938, "learning_rate": 4.940260408656093e-05, "loss": 0.2396, "num_input_tokens_seen": 6963776, "step": 6905 }, { "epoch": 3.257897218293258, "grad_norm": 0.8468083143234253, "learning_rate": 4.940036683988847e-05, "loss": 0.2265, "num_input_tokens_seen": 6968960, "step": 6910 }, { "epoch": 3.2602545968882604, "grad_norm": 0.2895679473876953, "learning_rate": 4.9398125462648195e-05, "loss": 0.1529, "num_input_tokens_seen": 6974240, "step": 6915 }, { "epoch": 3.2626119754832628, "grad_norm": 0.5320725440979004, "learning_rate": 4.9395879955219534e-05, "loss": 0.1742, "num_input_tokens_seen": 6979104, "step": 6920 }, { "epoch": 3.264969354078265, "grad_norm": 0.2969616651535034, "learning_rate": 4.9393630317982616e-05, "loss": 0.1808, "num_input_tokens_seen": 6984864, "step": 6925 }, { "epoch": 3.2673267326732676, "grad_norm": 0.031184367835521698, "learning_rate": 4.9391376551318265e-05, "loss": 0.0715, "num_input_tokens_seen": 6988768, "step": 6930 }, { "epoch": 3.2696841112682695, "grad_norm": 0.6572713851928711, "learning_rate": 4.9389118655608e-05, "loss": 0.3164, "num_input_tokens_seen": 6995040, "step": 6935 }, { "epoch": 3.272041489863272, "grad_norm": 0.5237798094749451, "learning_rate": 4.9386856631234065e-05, "loss": 0.1177, "num_input_tokens_seen": 7000032, "step": 6940 }, { "epoch": 3.2743988684582743, "grad_norm": 1.7039539813995361, "learning_rate": 4.9384590478579355e-05, "loss": 0.3025, "num_input_tokens_seen": 7004480, "step": 6945 }, { "epoch": 3.2767562470532767, "grad_norm": 2.0193264484405518, "learning_rate": 4.938232019802751e-05, "loss": 0.1527, "num_input_tokens_seen": 7009216, "step": 6950 }, { "epoch": 3.279113625648279, "grad_norm": 1.1230124235153198, "learning_rate": 4.938004578996284e-05, "loss": 0.1262, "num_input_tokens_seen": 7014368, "step": 6955 }, { "epoch": 3.2814710042432815, "grad_norm": 2.476564884185791, "learning_rate": 4.9377767254770377e-05, "loss": 0.2074, "num_input_tokens_seen": 7018880, "step": 6960 }, { "epoch": 3.283828382838284, "grad_norm": 0.2735523283481598, "learning_rate": 4.937548459283582e-05, "loss": 0.1769, "num_input_tokens_seen": 7023264, "step": 6965 }, { "epoch": 3.2861857614332863, "grad_norm": 0.33743834495544434, "learning_rate": 4.937319780454559e-05, "loss": 0.2032, "num_input_tokens_seen": 7028000, "step": 6970 }, { "epoch": 3.2885431400282887, "grad_norm": 0.5347578525543213, "learning_rate": 4.9370906890286815e-05, "loss": 0.1625, "num_input_tokens_seen": 7032960, "step": 6975 }, { "epoch": 3.2909005186232907, "grad_norm": 1.4062138795852661, "learning_rate": 4.9368611850447287e-05, "loss": 0.2326, "num_input_tokens_seen": 7037504, "step": 6980 }, { "epoch": 3.293257897218293, "grad_norm": 1.1476725339889526, "learning_rate": 4.936631268541554e-05, "loss": 0.1388, "num_input_tokens_seen": 7042304, "step": 6985 }, { "epoch": 3.2956152758132955, "grad_norm": 0.9895498752593994, "learning_rate": 4.936400939558076e-05, "loss": 0.1729, "num_input_tokens_seen": 7046528, "step": 6990 }, { "epoch": 3.297972654408298, "grad_norm": 0.6755147576332092, "learning_rate": 4.9361701981332875e-05, "loss": 0.1109, "num_input_tokens_seen": 7050496, "step": 6995 }, { "epoch": 3.3003300330033003, "grad_norm": 2.052471876144409, "learning_rate": 4.9359390443062484e-05, "loss": 0.2513, "num_input_tokens_seen": 7055776, "step": 7000 }, { "epoch": 3.3026874115983027, "grad_norm": 0.9573507308959961, "learning_rate": 4.935707478116089e-05, "loss": 0.1309, "num_input_tokens_seen": 7061248, "step": 7005 }, { "epoch": 3.305044790193305, "grad_norm": 1.179573655128479, "learning_rate": 4.9354754996020084e-05, "loss": 0.147, "num_input_tokens_seen": 7065856, "step": 7010 }, { "epoch": 3.3074021687883075, "grad_norm": 0.5070284008979797, "learning_rate": 4.935243108803279e-05, "loss": 0.1193, "num_input_tokens_seen": 7069568, "step": 7015 }, { "epoch": 3.30975954738331, "grad_norm": 0.5700567364692688, "learning_rate": 4.935010305759238e-05, "loss": 0.1827, "num_input_tokens_seen": 7074592, "step": 7020 }, { "epoch": 3.3121169259783123, "grad_norm": 2.005483388900757, "learning_rate": 4.9347770905092976e-05, "loss": 0.2869, "num_input_tokens_seen": 7080032, "step": 7025 }, { "epoch": 3.3144743045733147, "grad_norm": 2.019970178604126, "learning_rate": 4.9345434630929353e-05, "loss": 0.1718, "num_input_tokens_seen": 7083968, "step": 7030 }, { "epoch": 3.3168316831683167, "grad_norm": 0.41321003437042236, "learning_rate": 4.934309423549701e-05, "loss": 0.0985, "num_input_tokens_seen": 7089600, "step": 7035 }, { "epoch": 3.319189061763319, "grad_norm": 0.299960732460022, "learning_rate": 4.9340749719192136e-05, "loss": 0.1094, "num_input_tokens_seen": 7094976, "step": 7040 }, { "epoch": 3.3215464403583215, "grad_norm": 0.27455419301986694, "learning_rate": 4.933840108241162e-05, "loss": 0.2823, "num_input_tokens_seen": 7099424, "step": 7045 }, { "epoch": 3.323903818953324, "grad_norm": 1.786044955253601, "learning_rate": 4.933604832555304e-05, "loss": 0.1926, "num_input_tokens_seen": 7104448, "step": 7050 }, { "epoch": 3.3262611975483263, "grad_norm": 1.0163277387619019, "learning_rate": 4.9333691449014686e-05, "loss": 0.2503, "num_input_tokens_seen": 7109440, "step": 7055 }, { "epoch": 3.3286185761433287, "grad_norm": 0.6182330846786499, "learning_rate": 4.933133045319552e-05, "loss": 0.1661, "num_input_tokens_seen": 7114496, "step": 7060 }, { "epoch": 3.330975954738331, "grad_norm": 0.4769211411476135, "learning_rate": 4.932896533849524e-05, "loss": 0.2245, "num_input_tokens_seen": 7119360, "step": 7065 }, { "epoch": 3.3333333333333335, "grad_norm": 1.1860930919647217, "learning_rate": 4.9326596105314214e-05, "loss": 0.182, "num_input_tokens_seen": 7124160, "step": 7070 }, { "epoch": 3.335690711928336, "grad_norm": 0.22837035357952118, "learning_rate": 4.932422275405351e-05, "loss": 0.4256, "num_input_tokens_seen": 7128960, "step": 7075 }, { "epoch": 3.338048090523338, "grad_norm": 0.5166279077529907, "learning_rate": 4.9321845285114884e-05, "loss": 0.147, "num_input_tokens_seen": 7133888, "step": 7080 }, { "epoch": 3.3404054691183402, "grad_norm": 1.5478862524032593, "learning_rate": 4.9319463698900817e-05, "loss": 0.1763, "num_input_tokens_seen": 7138400, "step": 7085 }, { "epoch": 3.3427628477133426, "grad_norm": 0.6321823596954346, "learning_rate": 4.931707799581447e-05, "loss": 0.1279, "num_input_tokens_seen": 7144544, "step": 7090 }, { "epoch": 3.345120226308345, "grad_norm": 0.4564293324947357, "learning_rate": 4.9314688176259694e-05, "loss": 0.0906, "num_input_tokens_seen": 7149312, "step": 7095 }, { "epoch": 3.3474776049033474, "grad_norm": 2.3332955837249756, "learning_rate": 4.931229424064105e-05, "loss": 0.143, "num_input_tokens_seen": 7154208, "step": 7100 }, { "epoch": 3.34983498349835, "grad_norm": 1.190237045288086, "learning_rate": 4.93098961893638e-05, "loss": 0.1592, "num_input_tokens_seen": 7158528, "step": 7105 }, { "epoch": 3.3521923620933523, "grad_norm": 1.378997802734375, "learning_rate": 4.930749402283388e-05, "loss": 0.2855, "num_input_tokens_seen": 7163040, "step": 7110 }, { "epoch": 3.3545497406883547, "grad_norm": 0.2361306995153427, "learning_rate": 4.9305087741457935e-05, "loss": 0.2073, "num_input_tokens_seen": 7169024, "step": 7115 }, { "epoch": 3.356907119283357, "grad_norm": 0.5036490559577942, "learning_rate": 4.930267734564331e-05, "loss": 0.2195, "num_input_tokens_seen": 7173664, "step": 7120 }, { "epoch": 3.3592644978783595, "grad_norm": 0.9150017499923706, "learning_rate": 4.930026283579805e-05, "loss": 0.2795, "num_input_tokens_seen": 7178688, "step": 7125 }, { "epoch": 3.361621876473362, "grad_norm": 0.23477236926555634, "learning_rate": 4.929784421233089e-05, "loss": 0.0922, "num_input_tokens_seen": 7183680, "step": 7130 }, { "epoch": 3.363979255068364, "grad_norm": 1.120851993560791, "learning_rate": 4.9295421475651256e-05, "loss": 0.236, "num_input_tokens_seen": 7188352, "step": 7135 }, { "epoch": 3.366336633663366, "grad_norm": 0.8157948851585388, "learning_rate": 4.9292994626169276e-05, "loss": 0.2867, "num_input_tokens_seen": 7193632, "step": 7140 }, { "epoch": 3.3686940122583686, "grad_norm": 0.3477994501590729, "learning_rate": 4.9290563664295785e-05, "loss": 0.2128, "num_input_tokens_seen": 7197568, "step": 7145 }, { "epoch": 3.371051390853371, "grad_norm": 1.2495590448379517, "learning_rate": 4.928812859044228e-05, "loss": 0.2403, "num_input_tokens_seen": 7202880, "step": 7150 }, { "epoch": 3.3734087694483734, "grad_norm": 0.3181343972682953, "learning_rate": 4.928568940502101e-05, "loss": 0.1166, "num_input_tokens_seen": 7207264, "step": 7155 }, { "epoch": 3.375766148043376, "grad_norm": 1.3861693143844604, "learning_rate": 4.928324610844487e-05, "loss": 0.1851, "num_input_tokens_seen": 7213472, "step": 7160 }, { "epoch": 3.3781235266383782, "grad_norm": 0.23976163566112518, "learning_rate": 4.928079870112746e-05, "loss": 0.1604, "num_input_tokens_seen": 7218656, "step": 7165 }, { "epoch": 3.3804809052333806, "grad_norm": 0.6488722562789917, "learning_rate": 4.927834718348311e-05, "loss": 0.2417, "num_input_tokens_seen": 7224224, "step": 7170 }, { "epoch": 3.382838283828383, "grad_norm": 0.36592230200767517, "learning_rate": 4.927589155592679e-05, "loss": 0.0834, "num_input_tokens_seen": 7228864, "step": 7175 }, { "epoch": 3.385195662423385, "grad_norm": 0.7411390542984009, "learning_rate": 4.927343181887422e-05, "loss": 0.1975, "num_input_tokens_seen": 7233824, "step": 7180 }, { "epoch": 3.3875530410183874, "grad_norm": 0.37082889676094055, "learning_rate": 4.927096797274178e-05, "loss": 0.053, "num_input_tokens_seen": 7239584, "step": 7185 }, { "epoch": 3.38991041961339, "grad_norm": 1.2777973413467407, "learning_rate": 4.926850001794655e-05, "loss": 0.1343, "num_input_tokens_seen": 7244992, "step": 7190 }, { "epoch": 3.392267798208392, "grad_norm": 1.3291714191436768, "learning_rate": 4.926602795490633e-05, "loss": 0.1511, "num_input_tokens_seen": 7249184, "step": 7195 }, { "epoch": 3.3946251768033946, "grad_norm": 0.26084399223327637, "learning_rate": 4.9263551784039595e-05, "loss": 0.1454, "num_input_tokens_seen": 7254976, "step": 7200 }, { "epoch": 3.396982555398397, "grad_norm": 1.117848515510559, "learning_rate": 4.926107150576551e-05, "loss": 0.0987, "num_input_tokens_seen": 7259712, "step": 7205 }, { "epoch": 3.3993399339933994, "grad_norm": 2.3996777534484863, "learning_rate": 4.925858712050395e-05, "loss": 0.3942, "num_input_tokens_seen": 7263904, "step": 7210 }, { "epoch": 3.401697312588402, "grad_norm": 0.15875235199928284, "learning_rate": 4.9256098628675476e-05, "loss": 0.065, "num_input_tokens_seen": 7268288, "step": 7215 }, { "epoch": 3.404054691183404, "grad_norm": 1.4527957439422607, "learning_rate": 4.925360603070135e-05, "loss": 0.1392, "num_input_tokens_seen": 7272128, "step": 7220 }, { "epoch": 3.4064120697784066, "grad_norm": 1.0496182441711426, "learning_rate": 4.925110932700353e-05, "loss": 0.2349, "num_input_tokens_seen": 7276448, "step": 7225 }, { "epoch": 3.408769448373409, "grad_norm": 0.975279688835144, "learning_rate": 4.9248608518004656e-05, "loss": 0.2057, "num_input_tokens_seen": 7280896, "step": 7230 }, { "epoch": 3.411126826968411, "grad_norm": 0.6059332489967346, "learning_rate": 4.924610360412808e-05, "loss": 0.1759, "num_input_tokens_seen": 7286080, "step": 7235 }, { "epoch": 3.4134842055634134, "grad_norm": 1.0286730527877808, "learning_rate": 4.9243594585797836e-05, "loss": 0.2029, "num_input_tokens_seen": 7291232, "step": 7240 }, { "epoch": 3.4158415841584158, "grad_norm": 0.0433526486158371, "learning_rate": 4.924108146343867e-05, "loss": 0.1034, "num_input_tokens_seen": 7297184, "step": 7245 }, { "epoch": 3.418198962753418, "grad_norm": 0.33343306183815, "learning_rate": 4.9238564237475995e-05, "loss": 0.0798, "num_input_tokens_seen": 7303360, "step": 7250 }, { "epoch": 3.4205563413484206, "grad_norm": 0.2721928060054779, "learning_rate": 4.923604290833594e-05, "loss": 0.0876, "num_input_tokens_seen": 7308544, "step": 7255 }, { "epoch": 3.422913719943423, "grad_norm": 1.1016851663589478, "learning_rate": 4.9233517476445335e-05, "loss": 0.13, "num_input_tokens_seen": 7313120, "step": 7260 }, { "epoch": 3.4252710985384254, "grad_norm": 1.2764511108398438, "learning_rate": 4.923098794223168e-05, "loss": 0.2195, "num_input_tokens_seen": 7318464, "step": 7265 }, { "epoch": 3.4276284771334278, "grad_norm": 0.4314095675945282, "learning_rate": 4.922845430612318e-05, "loss": 0.0984, "num_input_tokens_seen": 7323168, "step": 7270 }, { "epoch": 3.42998585572843, "grad_norm": 0.5786857604980469, "learning_rate": 4.9225916568548744e-05, "loss": 0.1493, "num_input_tokens_seen": 7328320, "step": 7275 }, { "epoch": 3.432343234323432, "grad_norm": 2.3598053455352783, "learning_rate": 4.922337472993796e-05, "loss": 0.2109, "num_input_tokens_seen": 7334656, "step": 7280 }, { "epoch": 3.4347006129184345, "grad_norm": 0.26124975085258484, "learning_rate": 4.922082879072113e-05, "loss": 0.103, "num_input_tokens_seen": 7339616, "step": 7285 }, { "epoch": 3.437057991513437, "grad_norm": 0.8375787138938904, "learning_rate": 4.921827875132924e-05, "loss": 0.1321, "num_input_tokens_seen": 7344256, "step": 7290 }, { "epoch": 3.4394153701084393, "grad_norm": 0.26153868436813354, "learning_rate": 4.9215724612193946e-05, "loss": 0.0375, "num_input_tokens_seen": 7348320, "step": 7295 }, { "epoch": 3.4417727487034417, "grad_norm": 1.1595031023025513, "learning_rate": 4.921316637374764e-05, "loss": 0.1469, "num_input_tokens_seen": 7352640, "step": 7300 }, { "epoch": 3.444130127298444, "grad_norm": 0.5361219048500061, "learning_rate": 4.921060403642338e-05, "loss": 0.0997, "num_input_tokens_seen": 7357376, "step": 7305 }, { "epoch": 3.4464875058934465, "grad_norm": 0.02034725621342659, "learning_rate": 4.920803760065494e-05, "loss": 0.101, "num_input_tokens_seen": 7361600, "step": 7310 }, { "epoch": 3.448844884488449, "grad_norm": 0.6235990524291992, "learning_rate": 4.920546706687675e-05, "loss": 0.1894, "num_input_tokens_seen": 7366592, "step": 7315 }, { "epoch": 3.4512022630834513, "grad_norm": 0.20001043379306793, "learning_rate": 4.9202892435523987e-05, "loss": 0.0516, "num_input_tokens_seen": 7371520, "step": 7320 }, { "epoch": 3.4535596416784538, "grad_norm": 0.5185422897338867, "learning_rate": 4.920031370703246e-05, "loss": 0.1449, "num_input_tokens_seen": 7376896, "step": 7325 }, { "epoch": 3.455917020273456, "grad_norm": 0.9561343789100647, "learning_rate": 4.9197730881838724e-05, "loss": 0.1088, "num_input_tokens_seen": 7381472, "step": 7330 }, { "epoch": 3.458274398868458, "grad_norm": 0.250349223613739, "learning_rate": 4.9195143960380006e-05, "loss": 0.0283, "num_input_tokens_seen": 7385728, "step": 7335 }, { "epoch": 3.4606317774634605, "grad_norm": 0.805144727230072, "learning_rate": 4.9192552943094225e-05, "loss": 0.1295, "num_input_tokens_seen": 7390432, "step": 7340 }, { "epoch": 3.462989156058463, "grad_norm": 0.8586658239364624, "learning_rate": 4.918995783042e-05, "loss": 0.116, "num_input_tokens_seen": 7396128, "step": 7345 }, { "epoch": 3.4653465346534653, "grad_norm": 0.3361653983592987, "learning_rate": 4.9187358622796636e-05, "loss": 0.0903, "num_input_tokens_seen": 7400960, "step": 7350 }, { "epoch": 3.4677039132484677, "grad_norm": 1.3102538585662842, "learning_rate": 4.918475532066413e-05, "loss": 0.0845, "num_input_tokens_seen": 7405888, "step": 7355 }, { "epoch": 3.47006129184347, "grad_norm": 0.8349955677986145, "learning_rate": 4.918214792446319e-05, "loss": 0.3794, "num_input_tokens_seen": 7411072, "step": 7360 }, { "epoch": 3.4724186704384725, "grad_norm": 1.1816649436950684, "learning_rate": 4.9179536434635195e-05, "loss": 0.1961, "num_input_tokens_seen": 7417120, "step": 7365 }, { "epoch": 3.474776049033475, "grad_norm": 0.5488105416297913, "learning_rate": 4.917692085162222e-05, "loss": 0.1095, "num_input_tokens_seen": 7422592, "step": 7370 }, { "epoch": 3.4771334276284773, "grad_norm": 0.8846815228462219, "learning_rate": 4.917430117586705e-05, "loss": 0.1384, "num_input_tokens_seen": 7426848, "step": 7375 }, { "epoch": 3.4794908062234793, "grad_norm": 0.946752667427063, "learning_rate": 4.9171677407813146e-05, "loss": 0.1534, "num_input_tokens_seen": 7431712, "step": 7380 }, { "epoch": 3.4818481848184817, "grad_norm": 0.19753342866897583, "learning_rate": 4.916904954790467e-05, "loss": 0.2677, "num_input_tokens_seen": 7437088, "step": 7385 }, { "epoch": 3.484205563413484, "grad_norm": 1.8466660976409912, "learning_rate": 4.9166417596586464e-05, "loss": 0.1956, "num_input_tokens_seen": 7442240, "step": 7390 }, { "epoch": 3.4865629420084865, "grad_norm": 0.10067445784807205, "learning_rate": 4.916378155430409e-05, "loss": 0.044, "num_input_tokens_seen": 7446944, "step": 7395 }, { "epoch": 3.488920320603489, "grad_norm": 1.0351626873016357, "learning_rate": 4.916114142150378e-05, "loss": 0.1725, "num_input_tokens_seen": 7451328, "step": 7400 }, { "epoch": 3.4912776991984913, "grad_norm": 0.1645953506231308, "learning_rate": 4.915849719863245e-05, "loss": 0.255, "num_input_tokens_seen": 7456000, "step": 7405 }, { "epoch": 3.4936350777934937, "grad_norm": 0.6228551268577576, "learning_rate": 4.915584888613775e-05, "loss": 0.1793, "num_input_tokens_seen": 7460384, "step": 7410 }, { "epoch": 3.495992456388496, "grad_norm": 1.3886631727218628, "learning_rate": 4.915319648446795e-05, "loss": 0.1451, "num_input_tokens_seen": 7465920, "step": 7415 }, { "epoch": 3.4983498349834985, "grad_norm": 0.5533658266067505, "learning_rate": 4.9150539994072103e-05, "loss": 0.2229, "num_input_tokens_seen": 7470752, "step": 7420 }, { "epoch": 3.500707213578501, "grad_norm": 0.761064350605011, "learning_rate": 4.914787941539988e-05, "loss": 0.2288, "num_input_tokens_seen": 7475168, "step": 7425 }, { "epoch": 3.5030645921735033, "grad_norm": 0.6145287156105042, "learning_rate": 4.914521474890168e-05, "loss": 0.2017, "num_input_tokens_seen": 7479648, "step": 7430 }, { "epoch": 3.5054219707685053, "grad_norm": 0.47449004650115967, "learning_rate": 4.9142545995028586e-05, "loss": 0.0546, "num_input_tokens_seen": 7483872, "step": 7435 }, { "epoch": 3.5077793493635077, "grad_norm": 0.653410792350769, "learning_rate": 4.913987315423237e-05, "loss": 0.0871, "num_input_tokens_seen": 7489152, "step": 7440 }, { "epoch": 3.51013672795851, "grad_norm": 1.3233697414398193, "learning_rate": 4.913719622696551e-05, "loss": 0.1702, "num_input_tokens_seen": 7493760, "step": 7445 }, { "epoch": 3.5124941065535125, "grad_norm": 0.4158790409564972, "learning_rate": 4.9134515213681154e-05, "loss": 0.1967, "num_input_tokens_seen": 7497952, "step": 7450 }, { "epoch": 3.514851485148515, "grad_norm": 1.0929076671600342, "learning_rate": 4.913183011483314e-05, "loss": 0.1543, "num_input_tokens_seen": 7503872, "step": 7455 }, { "epoch": 3.5172088637435173, "grad_norm": 0.7875869274139404, "learning_rate": 4.9129140930876036e-05, "loss": 0.1466, "num_input_tokens_seen": 7509152, "step": 7460 }, { "epoch": 3.5195662423385197, "grad_norm": 0.4042298495769501, "learning_rate": 4.912644766226505e-05, "loss": 0.1066, "num_input_tokens_seen": 7513952, "step": 7465 }, { "epoch": 3.521923620933522, "grad_norm": 1.7084076404571533, "learning_rate": 4.912375030945613e-05, "loss": 0.3186, "num_input_tokens_seen": 7518720, "step": 7470 }, { "epoch": 3.524280999528524, "grad_norm": 1.2692044973373413, "learning_rate": 4.912104887290587e-05, "loss": 0.1832, "num_input_tokens_seen": 7523648, "step": 7475 }, { "epoch": 3.5266383781235264, "grad_norm": 1.8074496984481812, "learning_rate": 4.91183433530716e-05, "loss": 0.1126, "num_input_tokens_seen": 7528256, "step": 7480 }, { "epoch": 3.528995756718529, "grad_norm": 0.8402618169784546, "learning_rate": 4.91156337504113e-05, "loss": 0.1206, "num_input_tokens_seen": 7533248, "step": 7485 }, { "epoch": 3.5313531353135312, "grad_norm": 1.246201515197754, "learning_rate": 4.911292006538367e-05, "loss": 0.376, "num_input_tokens_seen": 7538272, "step": 7490 }, { "epoch": 3.5337105139085336, "grad_norm": 1.7451468706130981, "learning_rate": 4.911020229844808e-05, "loss": 0.1718, "num_input_tokens_seen": 7543872, "step": 7495 }, { "epoch": 3.536067892503536, "grad_norm": 1.2862194776535034, "learning_rate": 4.9107480450064616e-05, "loss": 0.17, "num_input_tokens_seen": 7548576, "step": 7500 }, { "epoch": 3.5384252710985384, "grad_norm": 0.24802406132221222, "learning_rate": 4.910475452069403e-05, "loss": 0.2481, "num_input_tokens_seen": 7554336, "step": 7505 }, { "epoch": 3.540782649693541, "grad_norm": 0.6186949014663696, "learning_rate": 4.9102024510797775e-05, "loss": 0.0779, "num_input_tokens_seen": 7558880, "step": 7510 }, { "epoch": 3.5431400282885432, "grad_norm": 0.24871201813220978, "learning_rate": 4.9099290420838e-05, "loss": 0.095, "num_input_tokens_seen": 7563264, "step": 7515 }, { "epoch": 3.5454974068835456, "grad_norm": 0.39643606543540955, "learning_rate": 4.9096552251277544e-05, "loss": 0.1982, "num_input_tokens_seen": 7568064, "step": 7520 }, { "epoch": 3.547854785478548, "grad_norm": 0.8192881941795349, "learning_rate": 4.909381000257993e-05, "loss": 0.116, "num_input_tokens_seen": 7574432, "step": 7525 }, { "epoch": 3.5502121640735504, "grad_norm": 0.40203210711479187, "learning_rate": 4.909106367520937e-05, "loss": 0.2374, "num_input_tokens_seen": 7579328, "step": 7530 }, { "epoch": 3.5525695426685524, "grad_norm": 0.8850836753845215, "learning_rate": 4.908831326963077e-05, "loss": 0.1258, "num_input_tokens_seen": 7584608, "step": 7535 }, { "epoch": 3.554926921263555, "grad_norm": 1.6599147319793701, "learning_rate": 4.9085558786309726e-05, "loss": 0.1575, "num_input_tokens_seen": 7589600, "step": 7540 }, { "epoch": 3.557284299858557, "grad_norm": 0.6544680595397949, "learning_rate": 4.908280022571254e-05, "loss": 0.0908, "num_input_tokens_seen": 7594624, "step": 7545 }, { "epoch": 3.5596416784535596, "grad_norm": 1.3114084005355835, "learning_rate": 4.908003758830617e-05, "loss": 0.2299, "num_input_tokens_seen": 7599360, "step": 7550 }, { "epoch": 3.561999057048562, "grad_norm": 2.096932888031006, "learning_rate": 4.9077270874558296e-05, "loss": 0.2312, "num_input_tokens_seen": 7603872, "step": 7555 }, { "epoch": 3.5643564356435644, "grad_norm": 0.45404160022735596, "learning_rate": 4.907450008493727e-05, "loss": 0.0706, "num_input_tokens_seen": 7608160, "step": 7560 }, { "epoch": 3.566713814238567, "grad_norm": 0.8907732963562012, "learning_rate": 4.9071725219912143e-05, "loss": 0.1772, "num_input_tokens_seen": 7614336, "step": 7565 }, { "epoch": 3.569071192833569, "grad_norm": 0.44665828347206116, "learning_rate": 4.906894627995265e-05, "loss": 0.1923, "num_input_tokens_seen": 7618592, "step": 7570 }, { "epoch": 3.571428571428571, "grad_norm": 0.5571368336677551, "learning_rate": 4.9066163265529216e-05, "loss": 0.0806, "num_input_tokens_seen": 7623296, "step": 7575 }, { "epoch": 3.5737859500235736, "grad_norm": 0.5175835490226746, "learning_rate": 4.9063376177112966e-05, "loss": 0.1026, "num_input_tokens_seen": 7629152, "step": 7580 }, { "epoch": 3.576143328618576, "grad_norm": 0.5670937299728394, "learning_rate": 4.9060585015175695e-05, "loss": 0.0392, "num_input_tokens_seen": 7633312, "step": 7585 }, { "epoch": 3.5785007072135784, "grad_norm": 0.19540858268737793, "learning_rate": 4.905778978018992e-05, "loss": 0.0673, "num_input_tokens_seen": 7637376, "step": 7590 }, { "epoch": 3.580858085808581, "grad_norm": 1.1667894124984741, "learning_rate": 4.90549904726288e-05, "loss": 0.2387, "num_input_tokens_seen": 7642080, "step": 7595 }, { "epoch": 3.583215464403583, "grad_norm": 0.6392051577568054, "learning_rate": 4.9052187092966225e-05, "loss": 0.1214, "num_input_tokens_seen": 7646752, "step": 7600 }, { "epoch": 3.5855728429985856, "grad_norm": 3.0480544567108154, "learning_rate": 4.904937964167676e-05, "loss": 0.2618, "num_input_tokens_seen": 7651200, "step": 7605 }, { "epoch": 3.587930221593588, "grad_norm": 0.32447707653045654, "learning_rate": 4.904656811923567e-05, "loss": 0.2343, "num_input_tokens_seen": 7655456, "step": 7610 }, { "epoch": 3.5902876001885904, "grad_norm": 0.1386127918958664, "learning_rate": 4.904375252611887e-05, "loss": 0.0823, "num_input_tokens_seen": 7659360, "step": 7615 }, { "epoch": 3.592644978783593, "grad_norm": 0.4599516987800598, "learning_rate": 4.9040932862803014e-05, "loss": 0.2061, "num_input_tokens_seen": 7663904, "step": 7620 }, { "epoch": 3.595002357378595, "grad_norm": 0.533892035484314, "learning_rate": 4.9038109129765405e-05, "loss": 0.1298, "num_input_tokens_seen": 7668352, "step": 7625 }, { "epoch": 3.5973597359735976, "grad_norm": 0.690556526184082, "learning_rate": 4.9035281327484075e-05, "loss": 0.0807, "num_input_tokens_seen": 7673952, "step": 7630 }, { "epoch": 3.5997171145685996, "grad_norm": 0.18706220388412476, "learning_rate": 4.9032449456437706e-05, "loss": 0.1476, "num_input_tokens_seen": 7679648, "step": 7635 }, { "epoch": 3.602074493163602, "grad_norm": 0.7550817728042603, "learning_rate": 4.90296135171057e-05, "loss": 0.0787, "num_input_tokens_seen": 7684192, "step": 7640 }, { "epoch": 3.6044318717586044, "grad_norm": 0.384054958820343, "learning_rate": 4.9026773509968115e-05, "loss": 0.0634, "num_input_tokens_seen": 7689376, "step": 7645 }, { "epoch": 3.6067892503536068, "grad_norm": 0.04242413491010666, "learning_rate": 4.902392943550573e-05, "loss": 0.0526, "num_input_tokens_seen": 7694336, "step": 7650 }, { "epoch": 3.609146628948609, "grad_norm": 0.14247441291809082, "learning_rate": 4.90210812942e-05, "loss": 0.1138, "num_input_tokens_seen": 7699648, "step": 7655 }, { "epoch": 3.6115040075436116, "grad_norm": 0.7321284413337708, "learning_rate": 4.901822908653305e-05, "loss": 0.113, "num_input_tokens_seen": 7705184, "step": 7660 }, { "epoch": 3.613861386138614, "grad_norm": 0.3674927055835724, "learning_rate": 4.9015372812987734e-05, "loss": 0.106, "num_input_tokens_seen": 7712064, "step": 7665 }, { "epoch": 3.6162187647336164, "grad_norm": 0.610046923160553, "learning_rate": 4.901251247404756e-05, "loss": 0.1521, "num_input_tokens_seen": 7717216, "step": 7670 }, { "epoch": 3.6185761433286183, "grad_norm": 0.5747331380844116, "learning_rate": 4.900964807019672e-05, "loss": 0.0978, "num_input_tokens_seen": 7722848, "step": 7675 }, { "epoch": 3.6209335219236207, "grad_norm": 1.2257956266403198, "learning_rate": 4.900677960192013e-05, "loss": 0.0757, "num_input_tokens_seen": 7727872, "step": 7680 }, { "epoch": 3.623290900518623, "grad_norm": 1.1684919595718384, "learning_rate": 4.9003907069703364e-05, "loss": 0.1945, "num_input_tokens_seen": 7733056, "step": 7685 }, { "epoch": 3.6256482791136255, "grad_norm": 0.3908778429031372, "learning_rate": 4.9001030474032695e-05, "loss": 0.1034, "num_input_tokens_seen": 7737792, "step": 7690 }, { "epoch": 3.628005657708628, "grad_norm": 0.5867111682891846, "learning_rate": 4.899814981539508e-05, "loss": 0.1711, "num_input_tokens_seen": 7742016, "step": 7695 }, { "epoch": 3.6303630363036303, "grad_norm": 0.41619452834129333, "learning_rate": 4.899526509427817e-05, "loss": 0.1318, "num_input_tokens_seen": 7749632, "step": 7700 }, { "epoch": 3.6327204148986327, "grad_norm": 0.953095018863678, "learning_rate": 4.8992376311170296e-05, "loss": 0.0966, "num_input_tokens_seen": 7754656, "step": 7705 }, { "epoch": 3.635077793493635, "grad_norm": 0.9920178055763245, "learning_rate": 4.8989483466560475e-05, "loss": 0.1708, "num_input_tokens_seen": 7759424, "step": 7710 }, { "epoch": 3.6374351720886375, "grad_norm": 0.18388615548610687, "learning_rate": 4.8986586560938426e-05, "loss": 0.051, "num_input_tokens_seen": 7764672, "step": 7715 }, { "epoch": 3.63979255068364, "grad_norm": 1.0399339199066162, "learning_rate": 4.898368559479454e-05, "loss": 0.1125, "num_input_tokens_seen": 7770880, "step": 7720 }, { "epoch": 3.6421499292786423, "grad_norm": 0.07374304533004761, "learning_rate": 4.89807805686199e-05, "loss": 0.027, "num_input_tokens_seen": 7775744, "step": 7725 }, { "epoch": 3.6445073078736447, "grad_norm": 1.5985530614852905, "learning_rate": 4.897787148290628e-05, "loss": 0.1876, "num_input_tokens_seen": 7780320, "step": 7730 }, { "epoch": 3.6468646864686467, "grad_norm": 0.11479198932647705, "learning_rate": 4.897495833814614e-05, "loss": 0.1216, "num_input_tokens_seen": 7784768, "step": 7735 }, { "epoch": 3.649222065063649, "grad_norm": 0.6236423254013062, "learning_rate": 4.8972041134832626e-05, "loss": 0.1781, "num_input_tokens_seen": 7790464, "step": 7740 }, { "epoch": 3.6515794436586515, "grad_norm": 1.9624345302581787, "learning_rate": 4.896911987345957e-05, "loss": 0.298, "num_input_tokens_seen": 7795584, "step": 7745 }, { "epoch": 3.653936822253654, "grad_norm": 0.7252243757247925, "learning_rate": 4.896619455452149e-05, "loss": 0.1831, "num_input_tokens_seen": 7800864, "step": 7750 }, { "epoch": 3.6562942008486563, "grad_norm": 1.0672520399093628, "learning_rate": 4.89632651785136e-05, "loss": 0.0736, "num_input_tokens_seen": 7804896, "step": 7755 }, { "epoch": 3.6586515794436587, "grad_norm": 1.5708897113800049, "learning_rate": 4.8960331745931783e-05, "loss": 0.2575, "num_input_tokens_seen": 7809376, "step": 7760 }, { "epoch": 3.661008958038661, "grad_norm": 1.2569149732589722, "learning_rate": 4.895739425727263e-05, "loss": 0.2276, "num_input_tokens_seen": 7813920, "step": 7765 }, { "epoch": 3.6633663366336635, "grad_norm": 1.531715750694275, "learning_rate": 4.89544527130334e-05, "loss": 0.213, "num_input_tokens_seen": 7818112, "step": 7770 }, { "epoch": 3.6657237152286655, "grad_norm": 0.3792062997817993, "learning_rate": 4.8951507113712045e-05, "loss": 0.2047, "num_input_tokens_seen": 7823104, "step": 7775 }, { "epoch": 3.668081093823668, "grad_norm": 1.2320915460586548, "learning_rate": 4.894855745980722e-05, "loss": 0.2075, "num_input_tokens_seen": 7828256, "step": 7780 }, { "epoch": 3.6704384724186703, "grad_norm": 0.29769808053970337, "learning_rate": 4.894560375181823e-05, "loss": 0.1392, "num_input_tokens_seen": 7833600, "step": 7785 }, { "epoch": 3.6727958510136727, "grad_norm": 0.4862428307533264, "learning_rate": 4.8942645990245095e-05, "loss": 0.1918, "num_input_tokens_seen": 7838208, "step": 7790 }, { "epoch": 3.675153229608675, "grad_norm": 0.0259744543582201, "learning_rate": 4.893968417558853e-05, "loss": 0.1692, "num_input_tokens_seen": 7843296, "step": 7795 }, { "epoch": 3.6775106082036775, "grad_norm": 0.3799028992652893, "learning_rate": 4.8936718308349894e-05, "loss": 0.2146, "num_input_tokens_seen": 7849024, "step": 7800 }, { "epoch": 3.67986798679868, "grad_norm": 1.0625935792922974, "learning_rate": 4.8933748389031276e-05, "loss": 0.2808, "num_input_tokens_seen": 7853792, "step": 7805 }, { "epoch": 3.6822253653936823, "grad_norm": 1.8956362009048462, "learning_rate": 4.893077441813543e-05, "loss": 0.1465, "num_input_tokens_seen": 7857824, "step": 7810 }, { "epoch": 3.6845827439886847, "grad_norm": 0.36768093705177307, "learning_rate": 4.892779639616579e-05, "loss": 0.0439, "num_input_tokens_seen": 7863008, "step": 7815 }, { "epoch": 3.686940122583687, "grad_norm": 1.8955450057983398, "learning_rate": 4.89248143236265e-05, "loss": 0.2856, "num_input_tokens_seen": 7867968, "step": 7820 }, { "epoch": 3.6892975011786895, "grad_norm": 0.24656188488006592, "learning_rate": 4.8921828201022365e-05, "loss": 0.0462, "num_input_tokens_seen": 7872928, "step": 7825 }, { "epoch": 3.691654879773692, "grad_norm": 0.19637134671211243, "learning_rate": 4.8918838028858874e-05, "loss": 0.1371, "num_input_tokens_seen": 7878304, "step": 7830 }, { "epoch": 3.694012258368694, "grad_norm": 0.8248302340507507, "learning_rate": 4.891584380764224e-05, "loss": 0.1063, "num_input_tokens_seen": 7883168, "step": 7835 }, { "epoch": 3.6963696369636962, "grad_norm": 1.824679970741272, "learning_rate": 4.8912845537879305e-05, "loss": 0.1652, "num_input_tokens_seen": 7887456, "step": 7840 }, { "epoch": 3.6987270155586986, "grad_norm": 0.5324264168739319, "learning_rate": 4.890984322007763e-05, "loss": 0.072, "num_input_tokens_seen": 7892000, "step": 7845 }, { "epoch": 3.701084394153701, "grad_norm": 0.28009340167045593, "learning_rate": 4.890683685474548e-05, "loss": 0.0889, "num_input_tokens_seen": 7896544, "step": 7850 }, { "epoch": 3.7034417727487035, "grad_norm": 0.20916126668453217, "learning_rate": 4.890382644239176e-05, "loss": 0.0989, "num_input_tokens_seen": 7901536, "step": 7855 }, { "epoch": 3.705799151343706, "grad_norm": 0.159226655960083, "learning_rate": 4.890081198352609e-05, "loss": 0.1182, "num_input_tokens_seen": 7906592, "step": 7860 }, { "epoch": 3.7081565299387083, "grad_norm": 1.1027958393096924, "learning_rate": 4.889779347865876e-05, "loss": 0.17, "num_input_tokens_seen": 7911776, "step": 7865 }, { "epoch": 3.7105139085337107, "grad_norm": 1.2577162981033325, "learning_rate": 4.889477092830075e-05, "loss": 0.3424, "num_input_tokens_seen": 7917024, "step": 7870 }, { "epoch": 3.7128712871287126, "grad_norm": 0.9942617416381836, "learning_rate": 4.889174433296374e-05, "loss": 0.1809, "num_input_tokens_seen": 7922592, "step": 7875 }, { "epoch": 3.715228665723715, "grad_norm": 2.331515312194824, "learning_rate": 4.888871369316007e-05, "loss": 0.162, "num_input_tokens_seen": 7927776, "step": 7880 }, { "epoch": 3.7175860443187174, "grad_norm": 0.28262367844581604, "learning_rate": 4.888567900940278e-05, "loss": 0.0542, "num_input_tokens_seen": 7933184, "step": 7885 }, { "epoch": 3.71994342291372, "grad_norm": 1.3475018739700317, "learning_rate": 4.888264028220559e-05, "loss": 0.1217, "num_input_tokens_seen": 7937152, "step": 7890 }, { "epoch": 3.7223008015087222, "grad_norm": 0.38204899430274963, "learning_rate": 4.88795975120829e-05, "loss": 0.0829, "num_input_tokens_seen": 7942464, "step": 7895 }, { "epoch": 3.7246581801037246, "grad_norm": 0.12624794244766235, "learning_rate": 4.887655069954981e-05, "loss": 0.0709, "num_input_tokens_seen": 7946848, "step": 7900 }, { "epoch": 3.727015558698727, "grad_norm": 0.21827483177185059, "learning_rate": 4.8873499845122084e-05, "loss": 0.0626, "num_input_tokens_seen": 7951872, "step": 7905 }, { "epoch": 3.7293729372937294, "grad_norm": 1.0082550048828125, "learning_rate": 4.8870444949316186e-05, "loss": 0.1276, "num_input_tokens_seen": 7957120, "step": 7910 }, { "epoch": 3.731730315888732, "grad_norm": 0.485259085893631, "learning_rate": 4.886738601264925e-05, "loss": 0.1536, "num_input_tokens_seen": 7963616, "step": 7915 }, { "epoch": 3.7340876944837342, "grad_norm": 0.49481409788131714, "learning_rate": 4.8864323035639117e-05, "loss": 0.081, "num_input_tokens_seen": 7969248, "step": 7920 }, { "epoch": 3.7364450730787366, "grad_norm": 0.7277068495750427, "learning_rate": 4.886125601880428e-05, "loss": 0.1931, "num_input_tokens_seen": 7974656, "step": 7925 }, { "epoch": 3.738802451673739, "grad_norm": 1.0133873224258423, "learning_rate": 4.885818496266394e-05, "loss": 0.0733, "num_input_tokens_seen": 7981184, "step": 7930 }, { "epoch": 3.741159830268741, "grad_norm": 1.208622694015503, "learning_rate": 4.885510986773797e-05, "loss": 0.179, "num_input_tokens_seen": 7985856, "step": 7935 }, { "epoch": 3.7435172088637434, "grad_norm": 1.1262056827545166, "learning_rate": 4.8852030734546946e-05, "loss": 0.1332, "num_input_tokens_seen": 7993088, "step": 7940 }, { "epoch": 3.745874587458746, "grad_norm": 0.4471977651119232, "learning_rate": 4.88489475636121e-05, "loss": 0.1482, "num_input_tokens_seen": 7997760, "step": 7945 }, { "epoch": 3.748231966053748, "grad_norm": 0.5754988193511963, "learning_rate": 4.884586035545536e-05, "loss": 0.0709, "num_input_tokens_seen": 8003648, "step": 7950 }, { "epoch": 3.7505893446487506, "grad_norm": 0.3326372802257538, "learning_rate": 4.884276911059934e-05, "loss": 0.0724, "num_input_tokens_seen": 8009248, "step": 7955 }, { "epoch": 3.752946723243753, "grad_norm": 0.43617352843284607, "learning_rate": 4.8839673829567343e-05, "loss": 0.1726, "num_input_tokens_seen": 8014464, "step": 7960 }, { "epoch": 3.7553041018387554, "grad_norm": 2.224417209625244, "learning_rate": 4.883657451288334e-05, "loss": 0.118, "num_input_tokens_seen": 8018560, "step": 7965 }, { "epoch": 3.757661480433758, "grad_norm": 1.1550160646438599, "learning_rate": 4.8833471161071995e-05, "loss": 0.1693, "num_input_tokens_seen": 8023712, "step": 7970 }, { "epoch": 3.7600188590287598, "grad_norm": 0.2793251574039459, "learning_rate": 4.883036377465864e-05, "loss": 0.1595, "num_input_tokens_seen": 8028736, "step": 7975 }, { "epoch": 3.762376237623762, "grad_norm": 0.764956533908844, "learning_rate": 4.8827252354169325e-05, "loss": 0.1182, "num_input_tokens_seen": 8033632, "step": 7980 }, { "epoch": 3.7647336162187646, "grad_norm": 1.0308210849761963, "learning_rate": 4.882413690013076e-05, "loss": 0.107, "num_input_tokens_seen": 8037376, "step": 7985 }, { "epoch": 3.767090994813767, "grad_norm": 0.2752512991428375, "learning_rate": 4.882101741307031e-05, "loss": 0.1694, "num_input_tokens_seen": 8041952, "step": 7990 }, { "epoch": 3.7694483734087694, "grad_norm": 0.0820392370223999, "learning_rate": 4.8817893893516076e-05, "loss": 0.1066, "num_input_tokens_seen": 8046752, "step": 7995 }, { "epoch": 3.7718057520037718, "grad_norm": 3.1598501205444336, "learning_rate": 4.8814766341996816e-05, "loss": 0.1654, "num_input_tokens_seen": 8051136, "step": 8000 }, { "epoch": 3.774163130598774, "grad_norm": 0.3991638123989105, "learning_rate": 4.881163475904196e-05, "loss": 0.1843, "num_input_tokens_seen": 8057280, "step": 8005 }, { "epoch": 3.7765205091937766, "grad_norm": 0.12373986840248108, "learning_rate": 4.880849914518164e-05, "loss": 0.0708, "num_input_tokens_seen": 8062688, "step": 8010 }, { "epoch": 3.778877887788779, "grad_norm": 0.4261779189109802, "learning_rate": 4.8805359500946655e-05, "loss": 0.0787, "num_input_tokens_seen": 8067584, "step": 8015 }, { "epoch": 3.7812352663837814, "grad_norm": 0.12713482975959778, "learning_rate": 4.880221582686851e-05, "loss": 0.0754, "num_input_tokens_seen": 8073056, "step": 8020 }, { "epoch": 3.783592644978784, "grad_norm": 0.410176545381546, "learning_rate": 4.879906812347935e-05, "loss": 0.1237, "num_input_tokens_seen": 8077504, "step": 8025 }, { "epoch": 3.785950023573786, "grad_norm": 0.3503492474555969, "learning_rate": 4.879591639131206e-05, "loss": 0.0917, "num_input_tokens_seen": 8082720, "step": 8030 }, { "epoch": 3.7883074021687886, "grad_norm": 0.05679516866803169, "learning_rate": 4.879276063090014e-05, "loss": 0.2862, "num_input_tokens_seen": 8086944, "step": 8035 }, { "epoch": 3.7906647807637905, "grad_norm": 0.7505331635475159, "learning_rate": 4.8789600842777824e-05, "loss": 0.1976, "num_input_tokens_seen": 8092448, "step": 8040 }, { "epoch": 3.793022159358793, "grad_norm": 0.18691560626029968, "learning_rate": 4.8786437027480024e-05, "loss": 0.1515, "num_input_tokens_seen": 8097344, "step": 8045 }, { "epoch": 3.7953795379537953, "grad_norm": 0.582057535648346, "learning_rate": 4.878326918554229e-05, "loss": 0.0964, "num_input_tokens_seen": 8102016, "step": 8050 }, { "epoch": 3.7977369165487977, "grad_norm": 1.8590205907821655, "learning_rate": 4.878009731750091e-05, "loss": 0.2829, "num_input_tokens_seen": 8107456, "step": 8055 }, { "epoch": 3.8000942951438, "grad_norm": 1.6162549257278442, "learning_rate": 4.877692142389282e-05, "loss": 0.1887, "num_input_tokens_seen": 8112576, "step": 8060 }, { "epoch": 3.8024516737388026, "grad_norm": 0.01982182078063488, "learning_rate": 4.877374150525563e-05, "loss": 0.0392, "num_input_tokens_seen": 8116928, "step": 8065 }, { "epoch": 3.804809052333805, "grad_norm": 0.22487841546535492, "learning_rate": 4.877055756212767e-05, "loss": 0.1458, "num_input_tokens_seen": 8121376, "step": 8070 }, { "epoch": 3.807166430928807, "grad_norm": 0.2986264228820801, "learning_rate": 4.876736959504791e-05, "loss": 0.0816, "num_input_tokens_seen": 8125952, "step": 8075 }, { "epoch": 3.8095238095238093, "grad_norm": 0.2548672556877136, "learning_rate": 4.8764177604556024e-05, "loss": 0.1118, "num_input_tokens_seen": 8130688, "step": 8080 }, { "epoch": 3.8118811881188117, "grad_norm": 1.0483832359313965, "learning_rate": 4.876098159119236e-05, "loss": 0.1105, "num_input_tokens_seen": 8136032, "step": 8085 }, { "epoch": 3.814238566713814, "grad_norm": 0.4030119478702545, "learning_rate": 4.875778155549795e-05, "loss": 0.3174, "num_input_tokens_seen": 8141024, "step": 8090 }, { "epoch": 3.8165959453088165, "grad_norm": 0.02696315012872219, "learning_rate": 4.875457749801451e-05, "loss": 0.0775, "num_input_tokens_seen": 8146176, "step": 8095 }, { "epoch": 3.818953323903819, "grad_norm": 0.5285171270370483, "learning_rate": 4.875136941928444e-05, "loss": 0.1534, "num_input_tokens_seen": 8151232, "step": 8100 }, { "epoch": 3.8213107024988213, "grad_norm": 0.9191251993179321, "learning_rate": 4.874815731985079e-05, "loss": 0.2398, "num_input_tokens_seen": 8156192, "step": 8105 }, { "epoch": 3.8236680810938237, "grad_norm": 6.608543395996094, "learning_rate": 4.8744941200257325e-05, "loss": 0.1152, "num_input_tokens_seen": 8160640, "step": 8110 }, { "epoch": 3.826025459688826, "grad_norm": 0.16912181675434113, "learning_rate": 4.874172106104849e-05, "loss": 0.0682, "num_input_tokens_seen": 8165024, "step": 8115 }, { "epoch": 3.8283828382838285, "grad_norm": 1.1281471252441406, "learning_rate": 4.873849690276938e-05, "loss": 0.1126, "num_input_tokens_seen": 8171296, "step": 8120 }, { "epoch": 3.830740216878831, "grad_norm": 1.6148202419281006, "learning_rate": 4.873526872596581e-05, "loss": 0.192, "num_input_tokens_seen": 8176576, "step": 8125 }, { "epoch": 3.8330975954738333, "grad_norm": 0.3989116847515106, "learning_rate": 4.8732036531184247e-05, "loss": 0.2558, "num_input_tokens_seen": 8181216, "step": 8130 }, { "epoch": 3.8354549740688357, "grad_norm": 0.43936604261398315, "learning_rate": 4.8728800318971846e-05, "loss": 0.1049, "num_input_tokens_seen": 8186016, "step": 8135 }, { "epoch": 3.8378123526638377, "grad_norm": 0.09543817490339279, "learning_rate": 4.872556008987644e-05, "loss": 0.1138, "num_input_tokens_seen": 8190944, "step": 8140 }, { "epoch": 3.84016973125884, "grad_norm": 0.5906413793563843, "learning_rate": 4.872231584444654e-05, "loss": 0.2512, "num_input_tokens_seen": 8195200, "step": 8145 }, { "epoch": 3.8425271098538425, "grad_norm": 1.3755403757095337, "learning_rate": 4.871906758323136e-05, "loss": 0.2751, "num_input_tokens_seen": 8199552, "step": 8150 }, { "epoch": 3.844884488448845, "grad_norm": 1.8457008600234985, "learning_rate": 4.8715815306780764e-05, "loss": 0.2223, "num_input_tokens_seen": 8204064, "step": 8155 }, { "epoch": 3.8472418670438473, "grad_norm": 1.1066462993621826, "learning_rate": 4.871255901564531e-05, "loss": 0.1211, "num_input_tokens_seen": 8208832, "step": 8160 }, { "epoch": 3.8495992456388497, "grad_norm": 2.30025315284729, "learning_rate": 4.870929871037623e-05, "loss": 0.2138, "num_input_tokens_seen": 8213696, "step": 8165 }, { "epoch": 3.851956624233852, "grad_norm": 1.0761107206344604, "learning_rate": 4.870603439152543e-05, "loss": 0.3262, "num_input_tokens_seen": 8218048, "step": 8170 }, { "epoch": 3.854314002828854, "grad_norm": 0.5177825689315796, "learning_rate": 4.8702766059645523e-05, "loss": 0.1378, "num_input_tokens_seen": 8221856, "step": 8175 }, { "epoch": 3.8566713814238565, "grad_norm": 0.39394980669021606, "learning_rate": 4.869949371528977e-05, "loss": 0.0841, "num_input_tokens_seen": 8227584, "step": 8180 }, { "epoch": 3.859028760018859, "grad_norm": 0.600025475025177, "learning_rate": 4.869621735901213e-05, "loss": 0.16, "num_input_tokens_seen": 8233280, "step": 8185 }, { "epoch": 3.8613861386138613, "grad_norm": 1.5809367895126343, "learning_rate": 4.869293699136722e-05, "loss": 0.2135, "num_input_tokens_seen": 8237568, "step": 8190 }, { "epoch": 3.8637435172088637, "grad_norm": 0.47863295674324036, "learning_rate": 4.868965261291036e-05, "loss": 0.1426, "num_input_tokens_seen": 8243200, "step": 8195 }, { "epoch": 3.866100895803866, "grad_norm": 1.237609624862671, "learning_rate": 4.8686364224197556e-05, "loss": 0.0523, "num_input_tokens_seen": 8247520, "step": 8200 }, { "epoch": 3.8684582743988685, "grad_norm": 0.09183426201343536, "learning_rate": 4.8683071825785446e-05, "loss": 0.0815, "num_input_tokens_seen": 8252192, "step": 8205 }, { "epoch": 3.870815652993871, "grad_norm": 4.187180995941162, "learning_rate": 4.86797754182314e-05, "loss": 0.1827, "num_input_tokens_seen": 8257024, "step": 8210 }, { "epoch": 3.8731730315888733, "grad_norm": 2.250539779663086, "learning_rate": 4.8676475002093424e-05, "loss": 0.1853, "num_input_tokens_seen": 8261216, "step": 8215 }, { "epoch": 3.8755304101838757, "grad_norm": 1.9516199827194214, "learning_rate": 4.8673170577930246e-05, "loss": 0.152, "num_input_tokens_seen": 8266176, "step": 8220 }, { "epoch": 3.877887788778878, "grad_norm": 0.2674523591995239, "learning_rate": 4.866986214630123e-05, "loss": 0.1333, "num_input_tokens_seen": 8271360, "step": 8225 }, { "epoch": 3.8802451673738805, "grad_norm": 1.5045166015625, "learning_rate": 4.8666549707766446e-05, "loss": 0.1303, "num_input_tokens_seen": 8276800, "step": 8230 }, { "epoch": 3.882602545968883, "grad_norm": 0.8090329170227051, "learning_rate": 4.866323326288663e-05, "loss": 0.129, "num_input_tokens_seen": 8282560, "step": 8235 }, { "epoch": 3.884959924563885, "grad_norm": 0.04844371974468231, "learning_rate": 4.8659912812223204e-05, "loss": 0.0662, "num_input_tokens_seen": 8288480, "step": 8240 }, { "epoch": 3.8873173031588872, "grad_norm": 0.9538539052009583, "learning_rate": 4.865658835633826e-05, "loss": 0.1415, "num_input_tokens_seen": 8294592, "step": 8245 }, { "epoch": 3.8896746817538896, "grad_norm": 1.1925466060638428, "learning_rate": 4.8653259895794566e-05, "loss": 0.078, "num_input_tokens_seen": 8300192, "step": 8250 }, { "epoch": 3.892032060348892, "grad_norm": 0.9577930569648743, "learning_rate": 4.8649927431155597e-05, "loss": 0.2034, "num_input_tokens_seen": 8306720, "step": 8255 }, { "epoch": 3.8943894389438944, "grad_norm": 0.2834290564060211, "learning_rate": 4.864659096298546e-05, "loss": 0.2702, "num_input_tokens_seen": 8312096, "step": 8260 }, { "epoch": 3.896746817538897, "grad_norm": 1.618336796760559, "learning_rate": 4.8643250491848965e-05, "loss": 0.1834, "num_input_tokens_seen": 8320704, "step": 8265 }, { "epoch": 3.8991041961338992, "grad_norm": 1.0026477575302124, "learning_rate": 4.86399060183116e-05, "loss": 0.1057, "num_input_tokens_seen": 8325120, "step": 8270 }, { "epoch": 3.901461574728901, "grad_norm": 0.05653489753603935, "learning_rate": 4.863655754293953e-05, "loss": 0.1582, "num_input_tokens_seen": 8330944, "step": 8275 }, { "epoch": 3.9038189533239036, "grad_norm": 2.122206926345825, "learning_rate": 4.86332050662996e-05, "loss": 0.342, "num_input_tokens_seen": 8335968, "step": 8280 }, { "epoch": 3.906176331918906, "grad_norm": 1.0514278411865234, "learning_rate": 4.862984858895931e-05, "loss": 0.1218, "num_input_tokens_seen": 8340576, "step": 8285 }, { "epoch": 3.9085337105139084, "grad_norm": 2.0891830921173096, "learning_rate": 4.8626488111486876e-05, "loss": 0.1876, "num_input_tokens_seen": 8346080, "step": 8290 }, { "epoch": 3.910891089108911, "grad_norm": 0.4042682647705078, "learning_rate": 4.862312363445116e-05, "loss": 0.3836, "num_input_tokens_seen": 8352000, "step": 8295 }, { "epoch": 3.913248467703913, "grad_norm": 0.14918646216392517, "learning_rate": 4.861975515842171e-05, "loss": 0.2326, "num_input_tokens_seen": 8356064, "step": 8300 }, { "epoch": 3.9156058462989156, "grad_norm": 0.7060531973838806, "learning_rate": 4.861638268396875e-05, "loss": 0.1057, "num_input_tokens_seen": 8360448, "step": 8305 }, { "epoch": 3.917963224893918, "grad_norm": 0.27337974309921265, "learning_rate": 4.861300621166318e-05, "loss": 0.1694, "num_input_tokens_seen": 8364896, "step": 8310 }, { "epoch": 3.9203206034889204, "grad_norm": 0.07089154422283173, "learning_rate": 4.860962574207659e-05, "loss": 0.1217, "num_input_tokens_seen": 8370592, "step": 8315 }, { "epoch": 3.922677982083923, "grad_norm": 1.1015715599060059, "learning_rate": 4.860624127578124e-05, "loss": 0.0849, "num_input_tokens_seen": 8374688, "step": 8320 }, { "epoch": 3.9250353606789252, "grad_norm": 1.6290366649627686, "learning_rate": 4.8602852813350035e-05, "loss": 0.2887, "num_input_tokens_seen": 8379872, "step": 8325 }, { "epoch": 3.9273927392739276, "grad_norm": 0.4549381732940674, "learning_rate": 4.859946035535661e-05, "loss": 0.1058, "num_input_tokens_seen": 8383808, "step": 8330 }, { "epoch": 3.92975011786893, "grad_norm": 1.5806037187576294, "learning_rate": 4.859606390237525e-05, "loss": 0.1802, "num_input_tokens_seen": 8388800, "step": 8335 }, { "epoch": 3.932107496463932, "grad_norm": 0.21602532267570496, "learning_rate": 4.85926634549809e-05, "loss": 0.1043, "num_input_tokens_seen": 8393312, "step": 8340 }, { "epoch": 3.9344648750589344, "grad_norm": 0.4116676151752472, "learning_rate": 4.858925901374921e-05, "loss": 0.0446, "num_input_tokens_seen": 8398144, "step": 8345 }, { "epoch": 3.936822253653937, "grad_norm": 0.7999024987220764, "learning_rate": 4.8585850579256486e-05, "loss": 0.1293, "num_input_tokens_seen": 8402720, "step": 8350 }, { "epoch": 3.939179632248939, "grad_norm": 0.5710982084274292, "learning_rate": 4.858243815207973e-05, "loss": 0.311, "num_input_tokens_seen": 8408512, "step": 8355 }, { "epoch": 3.9415370108439416, "grad_norm": 2.175614833831787, "learning_rate": 4.85790217327966e-05, "loss": 0.1104, "num_input_tokens_seen": 8413120, "step": 8360 }, { "epoch": 3.943894389438944, "grad_norm": 0.6347023248672485, "learning_rate": 4.8575601321985445e-05, "loss": 0.1682, "num_input_tokens_seen": 8417920, "step": 8365 }, { "epoch": 3.9462517680339464, "grad_norm": 0.8636303544044495, "learning_rate": 4.857217692022528e-05, "loss": 0.1139, "num_input_tokens_seen": 8422368, "step": 8370 }, { "epoch": 3.9486091466289484, "grad_norm": 0.21748194098472595, "learning_rate": 4.856874852809579e-05, "loss": 0.041, "num_input_tokens_seen": 8428160, "step": 8375 }, { "epoch": 3.9509665252239508, "grad_norm": 1.4201349020004272, "learning_rate": 4.856531614617734e-05, "loss": 0.28, "num_input_tokens_seen": 8432448, "step": 8380 }, { "epoch": 3.953323903818953, "grad_norm": 0.09799809008836746, "learning_rate": 4.856187977505099e-05, "loss": 0.1667, "num_input_tokens_seen": 8436928, "step": 8385 }, { "epoch": 3.9556812824139556, "grad_norm": 1.6549581289291382, "learning_rate": 4.855843941529846e-05, "loss": 0.2371, "num_input_tokens_seen": 8441408, "step": 8390 }, { "epoch": 3.958038661008958, "grad_norm": 0.5352523326873779, "learning_rate": 4.855499506750213e-05, "loss": 0.1364, "num_input_tokens_seen": 8445120, "step": 8395 }, { "epoch": 3.9603960396039604, "grad_norm": 0.14884565770626068, "learning_rate": 4.8551546732245065e-05, "loss": 0.1149, "num_input_tokens_seen": 8449632, "step": 8400 }, { "epoch": 3.9627534181989628, "grad_norm": 0.22934868931770325, "learning_rate": 4.854809441011103e-05, "loss": 0.1417, "num_input_tokens_seen": 8455392, "step": 8405 }, { "epoch": 3.965110796793965, "grad_norm": 0.6022189259529114, "learning_rate": 4.854463810168444e-05, "loss": 0.0891, "num_input_tokens_seen": 8461216, "step": 8410 }, { "epoch": 3.9674681753889676, "grad_norm": 1.8404167890548706, "learning_rate": 4.854117780755039e-05, "loss": 0.201, "num_input_tokens_seen": 8465888, "step": 8415 }, { "epoch": 3.96982555398397, "grad_norm": 0.38038018345832825, "learning_rate": 4.853771352829463e-05, "loss": 0.2873, "num_input_tokens_seen": 8471136, "step": 8420 }, { "epoch": 3.9721829325789724, "grad_norm": 1.8254234790802002, "learning_rate": 4.853424526450362e-05, "loss": 0.1103, "num_input_tokens_seen": 8477088, "step": 8425 }, { "epoch": 3.9745403111739748, "grad_norm": 0.7091248631477356, "learning_rate": 4.8530773016764475e-05, "loss": 0.0895, "num_input_tokens_seen": 8481440, "step": 8430 }, { "epoch": 3.976897689768977, "grad_norm": 0.023419177159667015, "learning_rate": 4.852729678566499e-05, "loss": 0.0871, "num_input_tokens_seen": 8486176, "step": 8435 }, { "epoch": 3.979255068363979, "grad_norm": 0.9720723628997803, "learning_rate": 4.852381657179363e-05, "loss": 0.2232, "num_input_tokens_seen": 8491648, "step": 8440 }, { "epoch": 3.9816124469589815, "grad_norm": 0.1673985719680786, "learning_rate": 4.8520332375739544e-05, "loss": 0.1358, "num_input_tokens_seen": 8496512, "step": 8445 }, { "epoch": 3.983969825553984, "grad_norm": 0.19207431375980377, "learning_rate": 4.851684419809253e-05, "loss": 0.2107, "num_input_tokens_seen": 8502208, "step": 8450 }, { "epoch": 3.9863272041489863, "grad_norm": 1.1425502300262451, "learning_rate": 4.851335203944308e-05, "loss": 0.3449, "num_input_tokens_seen": 8507200, "step": 8455 }, { "epoch": 3.9886845827439887, "grad_norm": 2.0318024158477783, "learning_rate": 4.8509855900382374e-05, "loss": 0.1471, "num_input_tokens_seen": 8511648, "step": 8460 }, { "epoch": 3.991041961338991, "grad_norm": 0.04858009144663811, "learning_rate": 4.850635578150224e-05, "loss": 0.2282, "num_input_tokens_seen": 8519520, "step": 8465 }, { "epoch": 3.9933993399339935, "grad_norm": 0.5182242393493652, "learning_rate": 4.850285168339518e-05, "loss": 0.2205, "num_input_tokens_seen": 8523488, "step": 8470 }, { "epoch": 3.9957567185289955, "grad_norm": 0.12257784605026245, "learning_rate": 4.849934360665439e-05, "loss": 0.0561, "num_input_tokens_seen": 8528576, "step": 8475 }, { "epoch": 3.998114097123998, "grad_norm": 1.1434047222137451, "learning_rate": 4.849583155187373e-05, "loss": 0.1735, "num_input_tokens_seen": 8533824, "step": 8480 }, { "epoch": 4.0, "eval_loss": 0.1634756624698639, "eval_runtime": 15.0889, "eval_samples_per_second": 62.496, "eval_steps_per_second": 15.641, "num_input_tokens_seen": 8537088, "step": 8484 }, { "epoch": 4.000471475719, "grad_norm": 1.8274320363998413, "learning_rate": 4.849231551964771e-05, "loss": 0.1965, "num_input_tokens_seen": 8537824, "step": 8485 }, { "epoch": 4.002828854314003, "grad_norm": 1.0583857297897339, "learning_rate": 4.848879551057156e-05, "loss": 0.1621, "num_input_tokens_seen": 8542784, "step": 8490 }, { "epoch": 4.005186232909005, "grad_norm": 1.1589388847351074, "learning_rate": 4.848527152524114e-05, "loss": 0.0765, "num_input_tokens_seen": 8547456, "step": 8495 }, { "epoch": 4.0075436115040075, "grad_norm": 0.39449986815452576, "learning_rate": 4.8481743564253015e-05, "loss": 0.1657, "num_input_tokens_seen": 8552704, "step": 8500 }, { "epoch": 4.00990099009901, "grad_norm": 0.19008523225784302, "learning_rate": 4.84782116282044e-05, "loss": 0.0898, "num_input_tokens_seen": 8558112, "step": 8505 }, { "epoch": 4.012258368694012, "grad_norm": 1.4095361232757568, "learning_rate": 4.8474675717693195e-05, "loss": 0.2254, "num_input_tokens_seen": 8563200, "step": 8510 }, { "epoch": 4.014615747289015, "grad_norm": 0.36356112360954285, "learning_rate": 4.847113583331796e-05, "loss": 0.0925, "num_input_tokens_seen": 8568128, "step": 8515 }, { "epoch": 4.016973125884017, "grad_norm": 0.4676603376865387, "learning_rate": 4.846759197567796e-05, "loss": 0.075, "num_input_tokens_seen": 8572736, "step": 8520 }, { "epoch": 4.0193305044790195, "grad_norm": 0.17291948199272156, "learning_rate": 4.846404414537308e-05, "loss": 0.1458, "num_input_tokens_seen": 8577248, "step": 8525 }, { "epoch": 4.021687883074022, "grad_norm": 0.16193003952503204, "learning_rate": 4.846049234300393e-05, "loss": 0.3683, "num_input_tokens_seen": 8581120, "step": 8530 }, { "epoch": 4.024045261669024, "grad_norm": 1.327894926071167, "learning_rate": 4.845693656917176e-05, "loss": 0.1239, "num_input_tokens_seen": 8586176, "step": 8535 }, { "epoch": 4.026402640264027, "grad_norm": 0.3270277678966522, "learning_rate": 4.8453376824478513e-05, "loss": 0.0665, "num_input_tokens_seen": 8590816, "step": 8540 }, { "epoch": 4.028760018859029, "grad_norm": 0.5343582630157471, "learning_rate": 4.844981310952678e-05, "loss": 0.0731, "num_input_tokens_seen": 8595104, "step": 8545 }, { "epoch": 4.0311173974540315, "grad_norm": 1.8013288974761963, "learning_rate": 4.8446245424919845e-05, "loss": 0.272, "num_input_tokens_seen": 8601632, "step": 8550 }, { "epoch": 4.033474776049033, "grad_norm": 0.2354414016008377, "learning_rate": 4.844267377126165e-05, "loss": 0.1878, "num_input_tokens_seen": 8605984, "step": 8555 }, { "epoch": 4.035832154644035, "grad_norm": 0.46214911341667175, "learning_rate": 4.8439098149156827e-05, "loss": 0.1971, "num_input_tokens_seen": 8611168, "step": 8560 }, { "epoch": 4.038189533239038, "grad_norm": 0.9620015621185303, "learning_rate": 4.8435518559210654e-05, "loss": 0.1107, "num_input_tokens_seen": 8615552, "step": 8565 }, { "epoch": 4.04054691183404, "grad_norm": 1.3327794075012207, "learning_rate": 4.84319350020291e-05, "loss": 0.1752, "num_input_tokens_seen": 8621024, "step": 8570 }, { "epoch": 4.042904290429043, "grad_norm": 0.3425038158893585, "learning_rate": 4.8428347478218804e-05, "loss": 0.1639, "num_input_tokens_seen": 8626016, "step": 8575 }, { "epoch": 4.045261669024045, "grad_norm": 1.5725321769714355, "learning_rate": 4.8424755988387074e-05, "loss": 0.083, "num_input_tokens_seen": 8630528, "step": 8580 }, { "epoch": 4.0476190476190474, "grad_norm": 1.9764653444290161, "learning_rate": 4.842116053314188e-05, "loss": 0.1134, "num_input_tokens_seen": 8636096, "step": 8585 }, { "epoch": 4.04997642621405, "grad_norm": 0.11421607434749603, "learning_rate": 4.8417561113091884e-05, "loss": 0.0751, "num_input_tokens_seen": 8640608, "step": 8590 }, { "epoch": 4.052333804809052, "grad_norm": 1.3636332750320435, "learning_rate": 4.841395772884639e-05, "loss": 0.1547, "num_input_tokens_seen": 8645664, "step": 8595 }, { "epoch": 4.054691183404055, "grad_norm": 2.379464626312256, "learning_rate": 4.84103503810154e-05, "loss": 0.3297, "num_input_tokens_seen": 8650272, "step": 8600 }, { "epoch": 4.057048561999057, "grad_norm": 1.031407117843628, "learning_rate": 4.840673907020958e-05, "loss": 0.1711, "num_input_tokens_seen": 8655360, "step": 8605 }, { "epoch": 4.0594059405940595, "grad_norm": 0.25133779644966125, "learning_rate": 4.840312379704026e-05, "loss": 0.0795, "num_input_tokens_seen": 8660160, "step": 8610 }, { "epoch": 4.061763319189062, "grad_norm": 0.8456221222877502, "learning_rate": 4.839950456211944e-05, "loss": 0.2458, "num_input_tokens_seen": 8664672, "step": 8615 }, { "epoch": 4.064120697784064, "grad_norm": 0.03223741054534912, "learning_rate": 4.839588136605979e-05, "loss": 0.1017, "num_input_tokens_seen": 8670176, "step": 8620 }, { "epoch": 4.066478076379067, "grad_norm": 0.12201737612485886, "learning_rate": 4.839225420947468e-05, "loss": 0.0894, "num_input_tokens_seen": 8675744, "step": 8625 }, { "epoch": 4.068835454974069, "grad_norm": 0.8847796320915222, "learning_rate": 4.8388623092978104e-05, "loss": 0.1317, "num_input_tokens_seen": 8680448, "step": 8630 }, { "epoch": 4.0711928335690715, "grad_norm": 0.09482472389936447, "learning_rate": 4.838498801718475e-05, "loss": 0.133, "num_input_tokens_seen": 8685216, "step": 8635 }, { "epoch": 4.073550212164074, "grad_norm": 1.7073782682418823, "learning_rate": 4.838134898270998e-05, "loss": 0.1856, "num_input_tokens_seen": 8690592, "step": 8640 }, { "epoch": 4.075907590759076, "grad_norm": 0.9717026948928833, "learning_rate": 4.837770599016983e-05, "loss": 0.0551, "num_input_tokens_seen": 8695680, "step": 8645 }, { "epoch": 4.078264969354079, "grad_norm": 0.9998600482940674, "learning_rate": 4.837405904018098e-05, "loss": 0.1265, "num_input_tokens_seen": 8701216, "step": 8650 }, { "epoch": 4.08062234794908, "grad_norm": 0.023757943883538246, "learning_rate": 4.83704081333608e-05, "loss": 0.1299, "num_input_tokens_seen": 8706176, "step": 8655 }, { "epoch": 4.082979726544083, "grad_norm": 0.1863415241241455, "learning_rate": 4.836675327032734e-05, "loss": 0.1241, "num_input_tokens_seen": 8711040, "step": 8660 }, { "epoch": 4.085337105139085, "grad_norm": 0.7578458189964294, "learning_rate": 4.836309445169929e-05, "loss": 0.1239, "num_input_tokens_seen": 8715520, "step": 8665 }, { "epoch": 4.087694483734087, "grad_norm": 0.4882408082485199, "learning_rate": 4.835943167809603e-05, "loss": 0.2394, "num_input_tokens_seen": 8720256, "step": 8670 }, { "epoch": 4.09005186232909, "grad_norm": 1.366208553314209, "learning_rate": 4.83557649501376e-05, "loss": 0.1008, "num_input_tokens_seen": 8724256, "step": 8675 }, { "epoch": 4.092409240924092, "grad_norm": 2.7730329036712646, "learning_rate": 4.8352094268444735e-05, "loss": 0.3123, "num_input_tokens_seen": 8730048, "step": 8680 }, { "epoch": 4.094766619519095, "grad_norm": 1.5284287929534912, "learning_rate": 4.834841963363881e-05, "loss": 0.1856, "num_input_tokens_seen": 8734656, "step": 8685 }, { "epoch": 4.097123998114097, "grad_norm": 0.9945083856582642, "learning_rate": 4.834474104634187e-05, "loss": 0.1884, "num_input_tokens_seen": 8739488, "step": 8690 }, { "epoch": 4.099481376709099, "grad_norm": 2.508100748062134, "learning_rate": 4.834105850717663e-05, "loss": 0.2026, "num_input_tokens_seen": 8744928, "step": 8695 }, { "epoch": 4.101838755304102, "grad_norm": 0.2749837636947632, "learning_rate": 4.833737201676651e-05, "loss": 0.1638, "num_input_tokens_seen": 8751872, "step": 8700 }, { "epoch": 4.104196133899104, "grad_norm": 0.5593211054801941, "learning_rate": 4.833368157573555e-05, "loss": 0.1451, "num_input_tokens_seen": 8756448, "step": 8705 }, { "epoch": 4.106553512494107, "grad_norm": 0.1762612909078598, "learning_rate": 4.832998718470847e-05, "loss": 0.1175, "num_input_tokens_seen": 8760832, "step": 8710 }, { "epoch": 4.108910891089109, "grad_norm": 0.13771241903305054, "learning_rate": 4.832628884431069e-05, "loss": 0.1404, "num_input_tokens_seen": 8764832, "step": 8715 }, { "epoch": 4.111268269684111, "grad_norm": 0.1987694501876831, "learning_rate": 4.832258655516827e-05, "loss": 0.474, "num_input_tokens_seen": 8769312, "step": 8720 }, { "epoch": 4.113625648279114, "grad_norm": 1.446470856666565, "learning_rate": 4.831888031790793e-05, "loss": 0.0742, "num_input_tokens_seen": 8775360, "step": 8725 }, { "epoch": 4.115983026874116, "grad_norm": 0.446193128824234, "learning_rate": 4.8315170133157095e-05, "loss": 0.0618, "num_input_tokens_seen": 8781536, "step": 8730 }, { "epoch": 4.118340405469119, "grad_norm": 1.447982907295227, "learning_rate": 4.831145600154382e-05, "loss": 0.1828, "num_input_tokens_seen": 8787328, "step": 8735 }, { "epoch": 4.120697784064121, "grad_norm": 0.1386314034461975, "learning_rate": 4.8307737923696855e-05, "loss": 0.0875, "num_input_tokens_seen": 8793376, "step": 8740 }, { "epoch": 4.123055162659123, "grad_norm": 1.8444417715072632, "learning_rate": 4.8304015900245595e-05, "loss": 0.1568, "num_input_tokens_seen": 8798304, "step": 8745 }, { "epoch": 4.125412541254126, "grad_norm": 0.5042266845703125, "learning_rate": 4.830028993182013e-05, "loss": 0.0575, "num_input_tokens_seen": 8804800, "step": 8750 }, { "epoch": 4.127769919849127, "grad_norm": 1.0625125169754028, "learning_rate": 4.82965600190512e-05, "loss": 0.08, "num_input_tokens_seen": 8809152, "step": 8755 }, { "epoch": 4.13012729844413, "grad_norm": 0.04300742968916893, "learning_rate": 4.82928261625702e-05, "loss": 0.1233, "num_input_tokens_seen": 8813536, "step": 8760 }, { "epoch": 4.132484677039132, "grad_norm": 0.9597614407539368, "learning_rate": 4.828908836300922e-05, "loss": 0.1418, "num_input_tokens_seen": 8818048, "step": 8765 }, { "epoch": 4.1348420556341345, "grad_norm": 0.4455181956291199, "learning_rate": 4.8285346621001015e-05, "loss": 0.3286, "num_input_tokens_seen": 8823008, "step": 8770 }, { "epoch": 4.137199434229137, "grad_norm": 0.2795201539993286, "learning_rate": 4.8281600937178986e-05, "loss": 0.1444, "num_input_tokens_seen": 8829376, "step": 8775 }, { "epoch": 4.139556812824139, "grad_norm": 0.9337305426597595, "learning_rate": 4.827785131217721e-05, "loss": 0.1069, "num_input_tokens_seen": 8833376, "step": 8780 }, { "epoch": 4.141914191419142, "grad_norm": 0.8191799521446228, "learning_rate": 4.827409774663045e-05, "loss": 0.0613, "num_input_tokens_seen": 8838368, "step": 8785 }, { "epoch": 4.144271570014144, "grad_norm": 0.3109506666660309, "learning_rate": 4.827034024117412e-05, "loss": 0.0873, "num_input_tokens_seen": 8844096, "step": 8790 }, { "epoch": 4.1466289486091465, "grad_norm": 1.4944645166397095, "learning_rate": 4.826657879644429e-05, "loss": 0.166, "num_input_tokens_seen": 8849440, "step": 8795 }, { "epoch": 4.148986327204149, "grad_norm": 1.4455256462097168, "learning_rate": 4.826281341307771e-05, "loss": 0.1293, "num_input_tokens_seen": 8854144, "step": 8800 }, { "epoch": 4.151343705799151, "grad_norm": 0.8834394812583923, "learning_rate": 4.82590440917118e-05, "loss": 0.1509, "num_input_tokens_seen": 8858496, "step": 8805 }, { "epoch": 4.153701084394154, "grad_norm": 0.7375301718711853, "learning_rate": 4.8255270832984645e-05, "loss": 0.2528, "num_input_tokens_seen": 8863360, "step": 8810 }, { "epoch": 4.156058462989156, "grad_norm": 1.4095454216003418, "learning_rate": 4.8251493637534985e-05, "loss": 0.3044, "num_input_tokens_seen": 8868384, "step": 8815 }, { "epoch": 4.158415841584159, "grad_norm": 0.09209619462490082, "learning_rate": 4.824771250600224e-05, "loss": 0.074, "num_input_tokens_seen": 8872096, "step": 8820 }, { "epoch": 4.160773220179161, "grad_norm": 0.9985911250114441, "learning_rate": 4.8243927439026495e-05, "loss": 0.0885, "num_input_tokens_seen": 8876576, "step": 8825 }, { "epoch": 4.163130598774163, "grad_norm": 0.5235750675201416, "learning_rate": 4.8240138437248494e-05, "loss": 0.2534, "num_input_tokens_seen": 8882976, "step": 8830 }, { "epoch": 4.165487977369166, "grad_norm": 0.08607152849435806, "learning_rate": 4.823634550130965e-05, "loss": 0.0888, "num_input_tokens_seen": 8888064, "step": 8835 }, { "epoch": 4.167845355964168, "grad_norm": 0.39690133929252625, "learning_rate": 4.823254863185204e-05, "loss": 0.08, "num_input_tokens_seen": 8894048, "step": 8840 }, { "epoch": 4.170202734559171, "grad_norm": 0.20904457569122314, "learning_rate": 4.8228747829518417e-05, "loss": 0.1664, "num_input_tokens_seen": 8899008, "step": 8845 }, { "epoch": 4.172560113154173, "grad_norm": 0.9678085446357727, "learning_rate": 4.822494309495219e-05, "loss": 0.1804, "num_input_tokens_seen": 8903456, "step": 8850 }, { "epoch": 4.174917491749175, "grad_norm": 0.434995174407959, "learning_rate": 4.822113442879743e-05, "loss": 0.0826, "num_input_tokens_seen": 8908384, "step": 8855 }, { "epoch": 4.177274870344177, "grad_norm": 0.7292838096618652, "learning_rate": 4.821732183169888e-05, "loss": 0.1332, "num_input_tokens_seen": 8912640, "step": 8860 }, { "epoch": 4.179632248939179, "grad_norm": 0.29603418707847595, "learning_rate": 4.821350530430196e-05, "loss": 0.1171, "num_input_tokens_seen": 8917792, "step": 8865 }, { "epoch": 4.181989627534182, "grad_norm": 0.5298593044281006, "learning_rate": 4.820968484725273e-05, "loss": 0.0177, "num_input_tokens_seen": 8922048, "step": 8870 }, { "epoch": 4.184347006129184, "grad_norm": 0.329758882522583, "learning_rate": 4.8205860461197936e-05, "loss": 0.042, "num_input_tokens_seen": 8927904, "step": 8875 }, { "epoch": 4.1867043847241865, "grad_norm": 0.2991575598716736, "learning_rate": 4.820203214678497e-05, "loss": 0.2164, "num_input_tokens_seen": 8932800, "step": 8880 }, { "epoch": 4.189061763319189, "grad_norm": 0.10284300893545151, "learning_rate": 4.8198199904661924e-05, "loss": 0.2574, "num_input_tokens_seen": 8937312, "step": 8885 }, { "epoch": 4.191419141914191, "grad_norm": 2.3998069763183594, "learning_rate": 4.819436373547751e-05, "loss": 0.2434, "num_input_tokens_seen": 8943008, "step": 8890 }, { "epoch": 4.193776520509194, "grad_norm": 1.5603253841400146, "learning_rate": 4.8190523639881135e-05, "loss": 0.3451, "num_input_tokens_seen": 8949184, "step": 8895 }, { "epoch": 4.196133899104196, "grad_norm": 1.4576668739318848, "learning_rate": 4.818667961852286e-05, "loss": 0.2036, "num_input_tokens_seen": 8954816, "step": 8900 }, { "epoch": 4.1984912776991985, "grad_norm": 1.8508065938949585, "learning_rate": 4.818283167205341e-05, "loss": 0.1716, "num_input_tokens_seen": 8960064, "step": 8905 }, { "epoch": 4.200848656294201, "grad_norm": 0.5906205773353577, "learning_rate": 4.817897980112418e-05, "loss": 0.1072, "num_input_tokens_seen": 8964288, "step": 8910 }, { "epoch": 4.203206034889203, "grad_norm": 0.5555269718170166, "learning_rate": 4.817512400638723e-05, "loss": 0.1785, "num_input_tokens_seen": 8969888, "step": 8915 }, { "epoch": 4.205563413484206, "grad_norm": 1.300323247909546, "learning_rate": 4.8171264288495274e-05, "loss": 0.2101, "num_input_tokens_seen": 8974400, "step": 8920 }, { "epoch": 4.207920792079208, "grad_norm": 0.4227427840232849, "learning_rate": 4.81674006481017e-05, "loss": 0.0857, "num_input_tokens_seen": 8980416, "step": 8925 }, { "epoch": 4.2102781706742105, "grad_norm": 0.03428430110216141, "learning_rate": 4.8163533085860556e-05, "loss": 0.0442, "num_input_tokens_seen": 8986880, "step": 8930 }, { "epoch": 4.212635549269213, "grad_norm": 1.6141562461853027, "learning_rate": 4.815966160242656e-05, "loss": 0.2381, "num_input_tokens_seen": 8991872, "step": 8935 }, { "epoch": 4.214992927864215, "grad_norm": 0.5299023389816284, "learning_rate": 4.8155786198455075e-05, "loss": 0.3307, "num_input_tokens_seen": 8996960, "step": 8940 }, { "epoch": 4.217350306459218, "grad_norm": 0.304774671792984, "learning_rate": 4.815190687460216e-05, "loss": 0.14, "num_input_tokens_seen": 9001152, "step": 8945 }, { "epoch": 4.21970768505422, "grad_norm": 0.6436876058578491, "learning_rate": 4.81480236315245e-05, "loss": 0.173, "num_input_tokens_seen": 9007360, "step": 8950 }, { "epoch": 4.222065063649222, "grad_norm": 0.23942764103412628, "learning_rate": 4.8144136469879476e-05, "loss": 0.2162, "num_input_tokens_seen": 9013280, "step": 8955 }, { "epoch": 4.224422442244224, "grad_norm": 0.45638221502304077, "learning_rate": 4.814024539032511e-05, "loss": 0.0734, "num_input_tokens_seen": 9018528, "step": 8960 }, { "epoch": 4.226779820839226, "grad_norm": 1.2402273416519165, "learning_rate": 4.813635039352009e-05, "loss": 0.1097, "num_input_tokens_seen": 9023264, "step": 8965 }, { "epoch": 4.229137199434229, "grad_norm": 0.18933217227458954, "learning_rate": 4.8132451480123794e-05, "loss": 0.1649, "num_input_tokens_seen": 9028672, "step": 8970 }, { "epoch": 4.231494578029231, "grad_norm": 0.4346315562725067, "learning_rate": 4.8128548650796226e-05, "loss": 0.1033, "num_input_tokens_seen": 9032704, "step": 8975 }, { "epoch": 4.233851956624234, "grad_norm": 0.14908376336097717, "learning_rate": 4.8124641906198065e-05, "loss": 0.1919, "num_input_tokens_seen": 9037664, "step": 8980 }, { "epoch": 4.236209335219236, "grad_norm": 0.5082288384437561, "learning_rate": 4.812073124699067e-05, "loss": 0.2465, "num_input_tokens_seen": 9042880, "step": 8985 }, { "epoch": 4.238566713814238, "grad_norm": 1.153926134109497, "learning_rate": 4.811681667383604e-05, "loss": 0.1394, "num_input_tokens_seen": 9048832, "step": 8990 }, { "epoch": 4.240924092409241, "grad_norm": 1.1640336513519287, "learning_rate": 4.811289818739685e-05, "loss": 0.1623, "num_input_tokens_seen": 9053984, "step": 8995 }, { "epoch": 4.243281471004243, "grad_norm": 0.17140044271945953, "learning_rate": 4.810897578833643e-05, "loss": 0.0819, "num_input_tokens_seen": 9059584, "step": 9000 }, { "epoch": 4.245638849599246, "grad_norm": 0.4395681619644165, "learning_rate": 4.8105049477318775e-05, "loss": 0.0592, "num_input_tokens_seen": 9064480, "step": 9005 }, { "epoch": 4.247996228194248, "grad_norm": 1.4319825172424316, "learning_rate": 4.810111925500855e-05, "loss": 0.1775, "num_input_tokens_seen": 9070528, "step": 9010 }, { "epoch": 4.2503536067892504, "grad_norm": 0.144849494099617, "learning_rate": 4.8097185122071064e-05, "loss": 0.2534, "num_input_tokens_seen": 9075360, "step": 9015 }, { "epoch": 4.252710985384253, "grad_norm": 0.2738487720489502, "learning_rate": 4.809324707917231e-05, "loss": 0.1635, "num_input_tokens_seen": 9079808, "step": 9020 }, { "epoch": 4.255068363979255, "grad_norm": 0.1751665621995926, "learning_rate": 4.808930512697892e-05, "loss": 0.0689, "num_input_tokens_seen": 9083744, "step": 9025 }, { "epoch": 4.257425742574258, "grad_norm": 0.06429672986268997, "learning_rate": 4.8085359266158216e-05, "loss": 0.1105, "num_input_tokens_seen": 9088256, "step": 9030 }, { "epoch": 4.25978312116926, "grad_norm": 0.5579308271408081, "learning_rate": 4.808140949737815e-05, "loss": 0.1627, "num_input_tokens_seen": 9093504, "step": 9035 }, { "epoch": 4.2621404997642625, "grad_norm": 1.2689809799194336, "learning_rate": 4.807745582130736e-05, "loss": 0.122, "num_input_tokens_seen": 9098528, "step": 9040 }, { "epoch": 4.264497878359265, "grad_norm": 0.26324668526649475, "learning_rate": 4.8073498238615125e-05, "loss": 0.1046, "num_input_tokens_seen": 9103296, "step": 9045 }, { "epoch": 4.266855256954267, "grad_norm": 0.07574823498725891, "learning_rate": 4.806953674997141e-05, "loss": 0.1367, "num_input_tokens_seen": 9108064, "step": 9050 }, { "epoch": 4.26921263554927, "grad_norm": 0.18972627818584442, "learning_rate": 4.806557135604682e-05, "loss": 0.1599, "num_input_tokens_seen": 9112320, "step": 9055 }, { "epoch": 4.271570014144271, "grad_norm": 0.0545755997300148, "learning_rate": 4.806160205751263e-05, "loss": 0.0521, "num_input_tokens_seen": 9118368, "step": 9060 }, { "epoch": 4.273927392739274, "grad_norm": 0.7887183427810669, "learning_rate": 4.805762885504077e-05, "loss": 0.1896, "num_input_tokens_seen": 9124000, "step": 9065 }, { "epoch": 4.276284771334276, "grad_norm": 1.9966166019439697, "learning_rate": 4.8053651749303854e-05, "loss": 0.3093, "num_input_tokens_seen": 9129472, "step": 9070 }, { "epoch": 4.278642149929278, "grad_norm": 1.0268770456314087, "learning_rate": 4.804967074097511e-05, "loss": 0.1147, "num_input_tokens_seen": 9133952, "step": 9075 }, { "epoch": 4.280999528524281, "grad_norm": 0.8966324925422668, "learning_rate": 4.804568583072848e-05, "loss": 0.2381, "num_input_tokens_seen": 9138944, "step": 9080 }, { "epoch": 4.283356907119283, "grad_norm": 3.419788122177124, "learning_rate": 4.804169701923853e-05, "loss": 0.2199, "num_input_tokens_seen": 9144352, "step": 9085 }, { "epoch": 4.285714285714286, "grad_norm": 0.15741965174674988, "learning_rate": 4.8037704307180496e-05, "loss": 0.0638, "num_input_tokens_seen": 9148896, "step": 9090 }, { "epoch": 4.288071664309288, "grad_norm": 0.6709905862808228, "learning_rate": 4.803370769523029e-05, "loss": 0.104, "num_input_tokens_seen": 9153472, "step": 9095 }, { "epoch": 4.29042904290429, "grad_norm": 1.6656626462936401, "learning_rate": 4.802970718406446e-05, "loss": 0.3898, "num_input_tokens_seen": 9159392, "step": 9100 }, { "epoch": 4.292786421499293, "grad_norm": 1.4597253799438477, "learning_rate": 4.802570277436022e-05, "loss": 0.1301, "num_input_tokens_seen": 9164032, "step": 9105 }, { "epoch": 4.295143800094295, "grad_norm": 1.4270923137664795, "learning_rate": 4.802169446679546e-05, "loss": 0.2116, "num_input_tokens_seen": 9168480, "step": 9110 }, { "epoch": 4.297501178689298, "grad_norm": 0.20729032158851624, "learning_rate": 4.8017682262048714e-05, "loss": 0.0671, "num_input_tokens_seen": 9172928, "step": 9115 }, { "epoch": 4.2998585572843, "grad_norm": 0.33458834886550903, "learning_rate": 4.801366616079917e-05, "loss": 0.2382, "num_input_tokens_seen": 9177504, "step": 9120 }, { "epoch": 4.302215935879302, "grad_norm": 0.14317156374454498, "learning_rate": 4.80096461637267e-05, "loss": 0.1393, "num_input_tokens_seen": 9183584, "step": 9125 }, { "epoch": 4.304573314474305, "grad_norm": 1.7265021800994873, "learning_rate": 4.800562227151182e-05, "loss": 0.2302, "num_input_tokens_seen": 9188640, "step": 9130 }, { "epoch": 4.306930693069307, "grad_norm": 0.36420926451683044, "learning_rate": 4.80015944848357e-05, "loss": 0.091, "num_input_tokens_seen": 9194016, "step": 9135 }, { "epoch": 4.30928807166431, "grad_norm": 0.06192987784743309, "learning_rate": 4.799756280438017e-05, "loss": 0.1134, "num_input_tokens_seen": 9199040, "step": 9140 }, { "epoch": 4.311645450259312, "grad_norm": 0.3976285755634308, "learning_rate": 4.799352723082775e-05, "loss": 0.2248, "num_input_tokens_seen": 9205472, "step": 9145 }, { "epoch": 4.314002828854314, "grad_norm": 0.5860492587089539, "learning_rate": 4.7989487764861566e-05, "loss": 0.1024, "num_input_tokens_seen": 9209600, "step": 9150 }, { "epoch": 4.316360207449316, "grad_norm": 0.2724934220314026, "learning_rate": 4.798544440716544e-05, "loss": 0.0709, "num_input_tokens_seen": 9214496, "step": 9155 }, { "epoch": 4.318717586044318, "grad_norm": 1.1359585523605347, "learning_rate": 4.798139715842386e-05, "loss": 0.1886, "num_input_tokens_seen": 9218912, "step": 9160 }, { "epoch": 4.321074964639321, "grad_norm": 1.1758365631103516, "learning_rate": 4.797734601932193e-05, "loss": 0.0961, "num_input_tokens_seen": 9223232, "step": 9165 }, { "epoch": 4.323432343234323, "grad_norm": 0.8130978941917419, "learning_rate": 4.797329099054546e-05, "loss": 0.1314, "num_input_tokens_seen": 9228224, "step": 9170 }, { "epoch": 4.3257897218293255, "grad_norm": 0.4061158001422882, "learning_rate": 4.796923207278088e-05, "loss": 0.1205, "num_input_tokens_seen": 9232768, "step": 9175 }, { "epoch": 4.328147100424328, "grad_norm": 0.0985974371433258, "learning_rate": 4.7965169266715315e-05, "loss": 0.0734, "num_input_tokens_seen": 9238080, "step": 9180 }, { "epoch": 4.33050447901933, "grad_norm": 0.7933114171028137, "learning_rate": 4.796110257303652e-05, "loss": 0.3569, "num_input_tokens_seen": 9243296, "step": 9185 }, { "epoch": 4.332861857614333, "grad_norm": 0.8273686766624451, "learning_rate": 4.795703199243291e-05, "loss": 0.1727, "num_input_tokens_seen": 9249440, "step": 9190 }, { "epoch": 4.335219236209335, "grad_norm": 2.0426526069641113, "learning_rate": 4.7952957525593575e-05, "loss": 0.1574, "num_input_tokens_seen": 9255392, "step": 9195 }, { "epoch": 4.3375766148043375, "grad_norm": 0.34255146980285645, "learning_rate": 4.794887917320825e-05, "loss": 0.1194, "num_input_tokens_seen": 9260768, "step": 9200 }, { "epoch": 4.33993399339934, "grad_norm": 0.3343595564365387, "learning_rate": 4.794479693596733e-05, "loss": 0.2798, "num_input_tokens_seen": 9266144, "step": 9205 }, { "epoch": 4.342291371994342, "grad_norm": 0.3682311475276947, "learning_rate": 4.794071081456187e-05, "loss": 0.0722, "num_input_tokens_seen": 9270336, "step": 9210 }, { "epoch": 4.344648750589345, "grad_norm": 1.1890957355499268, "learning_rate": 4.793662080968358e-05, "loss": 0.1928, "num_input_tokens_seen": 9275200, "step": 9215 }, { "epoch": 4.347006129184347, "grad_norm": 0.22157913446426392, "learning_rate": 4.793252692202483e-05, "loss": 0.0456, "num_input_tokens_seen": 9279360, "step": 9220 }, { "epoch": 4.3493635077793495, "grad_norm": 1.4174158573150635, "learning_rate": 4.792842915227864e-05, "loss": 0.2146, "num_input_tokens_seen": 9284544, "step": 9225 }, { "epoch": 4.351720886374352, "grad_norm": 0.314785361289978, "learning_rate": 4.79243275011387e-05, "loss": 0.3047, "num_input_tokens_seen": 9289184, "step": 9230 }, { "epoch": 4.354078264969354, "grad_norm": 0.6687234044075012, "learning_rate": 4.792022196929935e-05, "loss": 0.0921, "num_input_tokens_seen": 9294208, "step": 9235 }, { "epoch": 4.356435643564357, "grad_norm": 0.6518214344978333, "learning_rate": 4.7916112557455584e-05, "loss": 0.0361, "num_input_tokens_seen": 9298656, "step": 9240 }, { "epoch": 4.358793022159359, "grad_norm": 1.8953502178192139, "learning_rate": 4.791199926630305e-05, "loss": 0.1758, "num_input_tokens_seen": 9304960, "step": 9245 }, { "epoch": 4.3611504007543616, "grad_norm": 0.49266019463539124, "learning_rate": 4.7907882096538066e-05, "loss": 0.0824, "num_input_tokens_seen": 9310496, "step": 9250 }, { "epoch": 4.363507779349364, "grad_norm": 0.13749316334724426, "learning_rate": 4.79037610488576e-05, "loss": 0.1547, "num_input_tokens_seen": 9314528, "step": 9255 }, { "epoch": 4.3658651579443655, "grad_norm": 0.27417218685150146, "learning_rate": 4.789963612395927e-05, "loss": 0.0799, "num_input_tokens_seen": 9319936, "step": 9260 }, { "epoch": 4.368222536539368, "grad_norm": 0.43414485454559326, "learning_rate": 4.789550732254135e-05, "loss": 0.09, "num_input_tokens_seen": 9324608, "step": 9265 }, { "epoch": 4.37057991513437, "grad_norm": 0.3902275264263153, "learning_rate": 4.78913746453028e-05, "loss": 0.2163, "num_input_tokens_seen": 9329568, "step": 9270 }, { "epoch": 4.372937293729373, "grad_norm": 0.9540889859199524, "learning_rate": 4.788723809294318e-05, "loss": 0.22, "num_input_tokens_seen": 9333888, "step": 9275 }, { "epoch": 4.375294672324375, "grad_norm": 1.3237961530685425, "learning_rate": 4.7883097666162764e-05, "loss": 0.2085, "num_input_tokens_seen": 9338528, "step": 9280 }, { "epoch": 4.3776520509193775, "grad_norm": 1.3972797393798828, "learning_rate": 4.787895336566244e-05, "loss": 0.2884, "num_input_tokens_seen": 9343488, "step": 9285 }, { "epoch": 4.38000942951438, "grad_norm": 2.0951335430145264, "learning_rate": 4.787480519214378e-05, "loss": 0.2699, "num_input_tokens_seen": 9348256, "step": 9290 }, { "epoch": 4.382366808109382, "grad_norm": 1.3825088739395142, "learning_rate": 4.7870653146308984e-05, "loss": 0.0716, "num_input_tokens_seen": 9353216, "step": 9295 }, { "epoch": 4.384724186704385, "grad_norm": 0.31798315048217773, "learning_rate": 4.786649722886094e-05, "loss": 0.1922, "num_input_tokens_seen": 9358144, "step": 9300 }, { "epoch": 4.387081565299387, "grad_norm": 0.9143671989440918, "learning_rate": 4.786233744050317e-05, "loss": 0.1072, "num_input_tokens_seen": 9363072, "step": 9305 }, { "epoch": 4.3894389438943895, "grad_norm": 0.11697910726070404, "learning_rate": 4.785817378193984e-05, "loss": 0.1475, "num_input_tokens_seen": 9370080, "step": 9310 }, { "epoch": 4.391796322489392, "grad_norm": 0.02660522051155567, "learning_rate": 4.785400625387581e-05, "loss": 0.1062, "num_input_tokens_seen": 9375424, "step": 9315 }, { "epoch": 4.394153701084394, "grad_norm": 0.39190277457237244, "learning_rate": 4.784983485701655e-05, "loss": 0.0941, "num_input_tokens_seen": 9380512, "step": 9320 }, { "epoch": 4.396511079679397, "grad_norm": 0.39662569761276245, "learning_rate": 4.784565959206822e-05, "loss": 0.1701, "num_input_tokens_seen": 9385024, "step": 9325 }, { "epoch": 4.398868458274399, "grad_norm": 0.45532384514808655, "learning_rate": 4.784148045973762e-05, "loss": 0.1462, "num_input_tokens_seen": 9388832, "step": 9330 }, { "epoch": 4.4012258368694015, "grad_norm": 1.4341028928756714, "learning_rate": 4.78372974607322e-05, "loss": 0.2319, "num_input_tokens_seen": 9393952, "step": 9335 }, { "epoch": 4.403583215464404, "grad_norm": 2.9854886531829834, "learning_rate": 4.7833110595760076e-05, "loss": 0.3165, "num_input_tokens_seen": 9398496, "step": 9340 }, { "epoch": 4.405940594059406, "grad_norm": 0.7912357449531555, "learning_rate": 4.7828919865530016e-05, "loss": 0.1058, "num_input_tokens_seen": 9402848, "step": 9345 }, { "epoch": 4.408297972654409, "grad_norm": 0.6797986030578613, "learning_rate": 4.782472527075144e-05, "loss": 0.1046, "num_input_tokens_seen": 9407520, "step": 9350 }, { "epoch": 4.41065535124941, "grad_norm": 0.12037723511457443, "learning_rate": 4.7820526812134416e-05, "loss": 0.1903, "num_input_tokens_seen": 9412096, "step": 9355 }, { "epoch": 4.413012729844413, "grad_norm": 0.4890792667865753, "learning_rate": 4.781632449038966e-05, "loss": 0.2708, "num_input_tokens_seen": 9417696, "step": 9360 }, { "epoch": 4.415370108439415, "grad_norm": 0.5832732319831848, "learning_rate": 4.781211830622858e-05, "loss": 0.2023, "num_input_tokens_seen": 9423008, "step": 9365 }, { "epoch": 4.417727487034417, "grad_norm": 0.4624345600605011, "learning_rate": 4.78079082603632e-05, "loss": 0.0874, "num_input_tokens_seen": 9427488, "step": 9370 }, { "epoch": 4.42008486562942, "grad_norm": 1.0847690105438232, "learning_rate": 4.78036943535062e-05, "loss": 0.2436, "num_input_tokens_seen": 9432352, "step": 9375 }, { "epoch": 4.422442244224422, "grad_norm": 0.7690204381942749, "learning_rate": 4.779947658637093e-05, "loss": 0.1236, "num_input_tokens_seen": 9437184, "step": 9380 }, { "epoch": 4.424799622819425, "grad_norm": 0.4266703426837921, "learning_rate": 4.7795254959671396e-05, "loss": 0.0893, "num_input_tokens_seen": 9443264, "step": 9385 }, { "epoch": 4.427157001414427, "grad_norm": 0.9745933413505554, "learning_rate": 4.779102947412223e-05, "loss": 0.095, "num_input_tokens_seen": 9447392, "step": 9390 }, { "epoch": 4.429514380009429, "grad_norm": 1.3291326761245728, "learning_rate": 4.7786800130438745e-05, "loss": 0.088, "num_input_tokens_seen": 9452064, "step": 9395 }, { "epoch": 4.431871758604432, "grad_norm": 0.28301510214805603, "learning_rate": 4.7782566929336895e-05, "loss": 0.131, "num_input_tokens_seen": 9456960, "step": 9400 }, { "epoch": 4.434229137199434, "grad_norm": 0.08496025949716568, "learning_rate": 4.7778329871533296e-05, "loss": 0.1404, "num_input_tokens_seen": 9461856, "step": 9405 }, { "epoch": 4.436586515794437, "grad_norm": 1.001509428024292, "learning_rate": 4.77740889577452e-05, "loss": 0.1663, "num_input_tokens_seen": 9466848, "step": 9410 }, { "epoch": 4.438943894389439, "grad_norm": 0.9867052435874939, "learning_rate": 4.7769844188690526e-05, "loss": 0.1845, "num_input_tokens_seen": 9471648, "step": 9415 }, { "epoch": 4.441301272984441, "grad_norm": 1.2905893325805664, "learning_rate": 4.776559556508784e-05, "loss": 0.1443, "num_input_tokens_seen": 9476896, "step": 9420 }, { "epoch": 4.443658651579444, "grad_norm": 0.2924599051475525, "learning_rate": 4.776134308765636e-05, "loss": 0.1308, "num_input_tokens_seen": 9481056, "step": 9425 }, { "epoch": 4.446016030174446, "grad_norm": 1.0576609373092651, "learning_rate": 4.775708675711597e-05, "loss": 0.0873, "num_input_tokens_seen": 9486592, "step": 9430 }, { "epoch": 4.448373408769449, "grad_norm": 0.45076611638069153, "learning_rate": 4.7752826574187174e-05, "loss": 0.1183, "num_input_tokens_seen": 9491136, "step": 9435 }, { "epoch": 4.450730787364451, "grad_norm": 0.9000917673110962, "learning_rate": 4.7748562539591167e-05, "loss": 0.1737, "num_input_tokens_seen": 9495488, "step": 9440 }, { "epoch": 4.4530881659594534, "grad_norm": 0.08386589586734772, "learning_rate": 4.774429465404977e-05, "loss": 0.1514, "num_input_tokens_seen": 9500064, "step": 9445 }, { "epoch": 4.455445544554456, "grad_norm": 1.2613074779510498, "learning_rate": 4.774002291828546e-05, "loss": 0.1461, "num_input_tokens_seen": 9504960, "step": 9450 }, { "epoch": 4.457802923149458, "grad_norm": 0.2217211276292801, "learning_rate": 4.773574733302138e-05, "loss": 0.1504, "num_input_tokens_seen": 9511840, "step": 9455 }, { "epoch": 4.46016030174446, "grad_norm": 1.244493842124939, "learning_rate": 4.77314678989813e-05, "loss": 0.1877, "num_input_tokens_seen": 9521920, "step": 9460 }, { "epoch": 4.462517680339462, "grad_norm": 0.9655229449272156, "learning_rate": 4.772718461688967e-05, "loss": 0.119, "num_input_tokens_seen": 9527552, "step": 9465 }, { "epoch": 4.464875058934465, "grad_norm": 0.31775400042533875, "learning_rate": 4.772289748747156e-05, "loss": 0.2653, "num_input_tokens_seen": 9532960, "step": 9470 }, { "epoch": 4.467232437529467, "grad_norm": 0.41784945130348206, "learning_rate": 4.7718606511452734e-05, "loss": 0.0612, "num_input_tokens_seen": 9537952, "step": 9475 }, { "epoch": 4.469589816124469, "grad_norm": 0.9374456405639648, "learning_rate": 4.7714311689559556e-05, "loss": 0.1325, "num_input_tokens_seen": 9542848, "step": 9480 }, { "epoch": 4.471947194719472, "grad_norm": 0.7971554398536682, "learning_rate": 4.7710013022519073e-05, "loss": 0.2062, "num_input_tokens_seen": 9547680, "step": 9485 }, { "epoch": 4.474304573314474, "grad_norm": 1.2011522054672241, "learning_rate": 4.770571051105899e-05, "loss": 0.1433, "num_input_tokens_seen": 9552352, "step": 9490 }, { "epoch": 4.476661951909477, "grad_norm": 0.883989691734314, "learning_rate": 4.770140415590762e-05, "loss": 0.2407, "num_input_tokens_seen": 9559232, "step": 9495 }, { "epoch": 4.479019330504479, "grad_norm": 1.973559856414795, "learning_rate": 4.769709395779399e-05, "loss": 0.1524, "num_input_tokens_seen": 9563968, "step": 9500 }, { "epoch": 4.481376709099481, "grad_norm": 0.17767764627933502, "learning_rate": 4.7692779917447726e-05, "loss": 0.0842, "num_input_tokens_seen": 9568800, "step": 9505 }, { "epoch": 4.483734087694484, "grad_norm": 0.8013259172439575, "learning_rate": 4.7688462035599126e-05, "loss": 0.1927, "num_input_tokens_seen": 9573472, "step": 9510 }, { "epoch": 4.486091466289486, "grad_norm": 1.1027202606201172, "learning_rate": 4.768414031297912e-05, "loss": 0.1398, "num_input_tokens_seen": 9578528, "step": 9515 }, { "epoch": 4.488448844884489, "grad_norm": 0.5809659957885742, "learning_rate": 4.7679814750319324e-05, "loss": 0.2447, "num_input_tokens_seen": 9583680, "step": 9520 }, { "epoch": 4.490806223479491, "grad_norm": 0.04837752506136894, "learning_rate": 4.767548534835197e-05, "loss": 0.1341, "num_input_tokens_seen": 9588352, "step": 9525 }, { "epoch": 4.493163602074493, "grad_norm": 0.7863717079162598, "learning_rate": 4.767115210780995e-05, "loss": 0.0642, "num_input_tokens_seen": 9593600, "step": 9530 }, { "epoch": 4.495520980669496, "grad_norm": 0.3539692759513855, "learning_rate": 4.7666815029426816e-05, "loss": 0.0503, "num_input_tokens_seen": 9599296, "step": 9535 }, { "epoch": 4.497878359264498, "grad_norm": 1.7959192991256714, "learning_rate": 4.766247411393676e-05, "loss": 0.1123, "num_input_tokens_seen": 9603296, "step": 9540 }, { "epoch": 4.500235737859501, "grad_norm": 0.12510089576244354, "learning_rate": 4.7658129362074624e-05, "loss": 0.1146, "num_input_tokens_seen": 9609088, "step": 9545 }, { "epoch": 4.502593116454502, "grad_norm": 1.7054736614227295, "learning_rate": 4.76537807745759e-05, "loss": 0.1784, "num_input_tokens_seen": 9613920, "step": 9550 }, { "epoch": 4.5049504950495045, "grad_norm": 1.9322929382324219, "learning_rate": 4.764942835217672e-05, "loss": 0.0685, "num_input_tokens_seen": 9619040, "step": 9555 }, { "epoch": 4.507307873644507, "grad_norm": 0.2110191434621811, "learning_rate": 4.76450720956139e-05, "loss": 0.1175, "num_input_tokens_seen": 9624576, "step": 9560 }, { "epoch": 4.509665252239509, "grad_norm": 0.4452887177467346, "learning_rate": 4.764071200562485e-05, "loss": 0.191, "num_input_tokens_seen": 9629280, "step": 9565 }, { "epoch": 4.512022630834512, "grad_norm": 1.0692963600158691, "learning_rate": 4.763634808294768e-05, "loss": 0.2026, "num_input_tokens_seen": 9633984, "step": 9570 }, { "epoch": 4.514380009429514, "grad_norm": 2.005345582962036, "learning_rate": 4.763198032832113e-05, "loss": 0.207, "num_input_tokens_seen": 9638944, "step": 9575 }, { "epoch": 4.5167373880245165, "grad_norm": 0.19964776933193207, "learning_rate": 4.762760874248458e-05, "loss": 0.2082, "num_input_tokens_seen": 9643072, "step": 9580 }, { "epoch": 4.519094766619519, "grad_norm": 1.2369886636734009, "learning_rate": 4.7623233326178055e-05, "loss": 0.1249, "num_input_tokens_seen": 9647200, "step": 9585 }, { "epoch": 4.521452145214521, "grad_norm": 0.5614862442016602, "learning_rate": 4.761885408014225e-05, "loss": 0.0968, "num_input_tokens_seen": 9652960, "step": 9590 }, { "epoch": 4.523809523809524, "grad_norm": 0.9966623783111572, "learning_rate": 4.76144710051185e-05, "loss": 0.1287, "num_input_tokens_seen": 9657888, "step": 9595 }, { "epoch": 4.526166902404526, "grad_norm": 1.1273269653320312, "learning_rate": 4.761008410184878e-05, "loss": 0.1938, "num_input_tokens_seen": 9664320, "step": 9600 }, { "epoch": 4.5285242809995285, "grad_norm": 1.118461012840271, "learning_rate": 4.760569337107572e-05, "loss": 0.1708, "num_input_tokens_seen": 9670176, "step": 9605 }, { "epoch": 4.530881659594531, "grad_norm": 0.29374268651008606, "learning_rate": 4.7601298813542595e-05, "loss": 0.1065, "num_input_tokens_seen": 9674176, "step": 9610 }, { "epoch": 4.533239038189533, "grad_norm": 0.15725526213645935, "learning_rate": 4.759690042999333e-05, "loss": 0.0378, "num_input_tokens_seen": 9679808, "step": 9615 }, { "epoch": 4.535596416784536, "grad_norm": 1.144447922706604, "learning_rate": 4.7592498221172496e-05, "loss": 0.1916, "num_input_tokens_seen": 9685504, "step": 9620 }, { "epoch": 4.537953795379538, "grad_norm": 0.24205167591571808, "learning_rate": 4.7588092187825305e-05, "loss": 0.1394, "num_input_tokens_seen": 9690112, "step": 9625 }, { "epoch": 4.5403111739745405, "grad_norm": 0.3860180675983429, "learning_rate": 4.7583682330697635e-05, "loss": 0.1071, "num_input_tokens_seen": 9695392, "step": 9630 }, { "epoch": 4.542668552569543, "grad_norm": 0.5802816152572632, "learning_rate": 4.7579268650536e-05, "loss": 0.1999, "num_input_tokens_seen": 9699936, "step": 9635 }, { "epoch": 4.545025931164545, "grad_norm": 0.22682791948318481, "learning_rate": 4.757485114808755e-05, "loss": 0.0896, "num_input_tokens_seen": 9704736, "step": 9640 }, { "epoch": 4.547383309759548, "grad_norm": 0.7978226542472839, "learning_rate": 4.757042982410011e-05, "loss": 0.0916, "num_input_tokens_seen": 9710016, "step": 9645 }, { "epoch": 4.54974068835455, "grad_norm": 1.3679903745651245, "learning_rate": 4.7566004679322114e-05, "loss": 0.1967, "num_input_tokens_seen": 9714816, "step": 9650 }, { "epoch": 4.5520980669495525, "grad_norm": 0.16713783144950867, "learning_rate": 4.7561575714502676e-05, "loss": 0.1904, "num_input_tokens_seen": 9719744, "step": 9655 }, { "epoch": 4.554455445544555, "grad_norm": 0.29042792320251465, "learning_rate": 4.755714293039155e-05, "loss": 0.0831, "num_input_tokens_seen": 9724960, "step": 9660 }, { "epoch": 4.5568128241395565, "grad_norm": 0.9099878668785095, "learning_rate": 4.755270632773912e-05, "loss": 0.1512, "num_input_tokens_seen": 9729408, "step": 9665 }, { "epoch": 4.559170202734559, "grad_norm": 3.848856210708618, "learning_rate": 4.754826590729643e-05, "loss": 0.2232, "num_input_tokens_seen": 9735360, "step": 9670 }, { "epoch": 4.561527581329561, "grad_norm": 2.256215810775757, "learning_rate": 4.754382166981517e-05, "loss": 0.1192, "num_input_tokens_seen": 9740736, "step": 9675 }, { "epoch": 4.563884959924564, "grad_norm": 0.09107338637113571, "learning_rate": 4.753937361604767e-05, "loss": 0.0456, "num_input_tokens_seen": 9745312, "step": 9680 }, { "epoch": 4.566242338519566, "grad_norm": 0.3605114221572876, "learning_rate": 4.753492174674692e-05, "loss": 0.1333, "num_input_tokens_seen": 9749472, "step": 9685 }, { "epoch": 4.5685997171145685, "grad_norm": 0.8072890043258667, "learning_rate": 4.753046606266654e-05, "loss": 0.1384, "num_input_tokens_seen": 9753568, "step": 9690 }, { "epoch": 4.570957095709571, "grad_norm": 0.6812373995780945, "learning_rate": 4.75260065645608e-05, "loss": 0.222, "num_input_tokens_seen": 9758432, "step": 9695 }, { "epoch": 4.573314474304573, "grad_norm": 0.32853782176971436, "learning_rate": 4.752154325318461e-05, "loss": 0.0822, "num_input_tokens_seen": 9763872, "step": 9700 }, { "epoch": 4.575671852899576, "grad_norm": 2.922748565673828, "learning_rate": 4.7517076129293556e-05, "loss": 0.1888, "num_input_tokens_seen": 9769248, "step": 9705 }, { "epoch": 4.578029231494578, "grad_norm": 0.7510973215103149, "learning_rate": 4.7512605193643824e-05, "loss": 0.0957, "num_input_tokens_seen": 9773600, "step": 9710 }, { "epoch": 4.5803866100895805, "grad_norm": 1.3974735736846924, "learning_rate": 4.750813044699227e-05, "loss": 0.2465, "num_input_tokens_seen": 9777696, "step": 9715 }, { "epoch": 4.582743988684583, "grad_norm": 0.19502347707748413, "learning_rate": 4.7503651890096404e-05, "loss": 0.092, "num_input_tokens_seen": 9781984, "step": 9720 }, { "epoch": 4.585101367279585, "grad_norm": 0.4161659777164459, "learning_rate": 4.749916952371436e-05, "loss": 0.1451, "num_input_tokens_seen": 9786912, "step": 9725 }, { "epoch": 4.587458745874588, "grad_norm": 0.6786653995513916, "learning_rate": 4.749468334860493e-05, "loss": 0.2738, "num_input_tokens_seen": 9791840, "step": 9730 }, { "epoch": 4.58981612446959, "grad_norm": 0.3378766179084778, "learning_rate": 4.749019336552756e-05, "loss": 0.1109, "num_input_tokens_seen": 9797184, "step": 9735 }, { "epoch": 4.5921735030645925, "grad_norm": 0.10608580708503723, "learning_rate": 4.7485699575242296e-05, "loss": 0.1596, "num_input_tokens_seen": 9802016, "step": 9740 }, { "epoch": 4.594530881659595, "grad_norm": 1.858498454093933, "learning_rate": 4.7481201978509894e-05, "loss": 0.1625, "num_input_tokens_seen": 9807776, "step": 9745 }, { "epoch": 4.596888260254596, "grad_norm": 0.2102442979812622, "learning_rate": 4.747670057609169e-05, "loss": 0.1742, "num_input_tokens_seen": 9813312, "step": 9750 }, { "epoch": 4.599245638849599, "grad_norm": 0.6488466262817383, "learning_rate": 4.747219536874973e-05, "loss": 0.0988, "num_input_tokens_seen": 9818144, "step": 9755 }, { "epoch": 4.601603017444601, "grad_norm": 0.36530396342277527, "learning_rate": 4.746768635724663e-05, "loss": 0.0906, "num_input_tokens_seen": 9822816, "step": 9760 }, { "epoch": 4.603960396039604, "grad_norm": 3.5857045650482178, "learning_rate": 4.746317354234573e-05, "loss": 0.1614, "num_input_tokens_seen": 9827136, "step": 9765 }, { "epoch": 4.606317774634606, "grad_norm": 0.7102648615837097, "learning_rate": 4.745865692481094e-05, "loss": 0.1285, "num_input_tokens_seen": 9831872, "step": 9770 }, { "epoch": 4.608675153229608, "grad_norm": 0.6920474767684937, "learning_rate": 4.745413650540686e-05, "loss": 0.1068, "num_input_tokens_seen": 9836704, "step": 9775 }, { "epoch": 4.611032531824611, "grad_norm": 0.7151291370391846, "learning_rate": 4.744961228489872e-05, "loss": 0.1801, "num_input_tokens_seen": 9840928, "step": 9780 }, { "epoch": 4.613389910419613, "grad_norm": 0.8975223302841187, "learning_rate": 4.74450842640524e-05, "loss": 0.2983, "num_input_tokens_seen": 9845312, "step": 9785 }, { "epoch": 4.615747289014616, "grad_norm": 1.3164138793945312, "learning_rate": 4.7440552443634404e-05, "loss": 0.2053, "num_input_tokens_seen": 9850336, "step": 9790 }, { "epoch": 4.618104667609618, "grad_norm": 0.18563230335712433, "learning_rate": 4.743601682441189e-05, "loss": 0.1273, "num_input_tokens_seen": 9854816, "step": 9795 }, { "epoch": 4.62046204620462, "grad_norm": 0.9865877032279968, "learning_rate": 4.743147740715268e-05, "loss": 0.1348, "num_input_tokens_seen": 9858560, "step": 9800 }, { "epoch": 4.622819424799623, "grad_norm": 0.14371705055236816, "learning_rate": 4.7426934192625204e-05, "loss": 0.0532, "num_input_tokens_seen": 9864096, "step": 9805 }, { "epoch": 4.625176803394625, "grad_norm": 0.06028349697589874, "learning_rate": 4.742238718159856e-05, "loss": 0.0276, "num_input_tokens_seen": 9869024, "step": 9810 }, { "epoch": 4.627534181989628, "grad_norm": 1.6921931505203247, "learning_rate": 4.7417836374842474e-05, "loss": 0.103, "num_input_tokens_seen": 9874432, "step": 9815 }, { "epoch": 4.62989156058463, "grad_norm": 0.07230605185031891, "learning_rate": 4.741328177312733e-05, "loss": 0.234, "num_input_tokens_seen": 9878400, "step": 9820 }, { "epoch": 4.632248939179632, "grad_norm": 0.9555423855781555, "learning_rate": 4.7408723377224126e-05, "loss": 0.1443, "num_input_tokens_seen": 9884256, "step": 9825 }, { "epoch": 4.634606317774635, "grad_norm": 1.7542903423309326, "learning_rate": 4.7404161187904545e-05, "loss": 0.1771, "num_input_tokens_seen": 9889184, "step": 9830 }, { "epoch": 4.636963696369637, "grad_norm": 0.12387141585350037, "learning_rate": 4.7399595205940874e-05, "loss": 0.345, "num_input_tokens_seen": 9893472, "step": 9835 }, { "epoch": 4.63932107496464, "grad_norm": 1.5476411581039429, "learning_rate": 4.739502543210605e-05, "loss": 0.1615, "num_input_tokens_seen": 9897472, "step": 9840 }, { "epoch": 4.641678453559642, "grad_norm": 0.19583404064178467, "learning_rate": 4.739045186717367e-05, "loss": 0.1583, "num_input_tokens_seen": 9901568, "step": 9845 }, { "epoch": 4.644035832154644, "grad_norm": 0.1946127712726593, "learning_rate": 4.738587451191796e-05, "loss": 0.1622, "num_input_tokens_seen": 9907584, "step": 9850 }, { "epoch": 4.646393210749647, "grad_norm": 1.8107507228851318, "learning_rate": 4.738129336711378e-05, "loss": 0.3234, "num_input_tokens_seen": 9912160, "step": 9855 }, { "epoch": 4.648750589344649, "grad_norm": 0.35269907116889954, "learning_rate": 4.737670843353665e-05, "loss": 0.2356, "num_input_tokens_seen": 9916800, "step": 9860 }, { "epoch": 4.651107967939651, "grad_norm": 0.07220911979675293, "learning_rate": 4.7372119711962714e-05, "loss": 0.0101, "num_input_tokens_seen": 9922016, "step": 9865 }, { "epoch": 4.653465346534653, "grad_norm": 1.7843037843704224, "learning_rate": 4.736752720316877e-05, "loss": 0.2442, "num_input_tokens_seen": 9926720, "step": 9870 }, { "epoch": 4.655822725129656, "grad_norm": 0.7884724140167236, "learning_rate": 4.7362930907932253e-05, "loss": 0.1327, "num_input_tokens_seen": 9931392, "step": 9875 }, { "epoch": 4.658180103724658, "grad_norm": 2.4373326301574707, "learning_rate": 4.7358330827031225e-05, "loss": 0.2585, "num_input_tokens_seen": 9936576, "step": 9880 }, { "epoch": 4.66053748231966, "grad_norm": 1.0743416547775269, "learning_rate": 4.735372696124442e-05, "loss": 0.2107, "num_input_tokens_seen": 9941824, "step": 9885 }, { "epoch": 4.662894860914663, "grad_norm": 1.5634130239486694, "learning_rate": 4.734911931135118e-05, "loss": 0.1623, "num_input_tokens_seen": 9946688, "step": 9890 }, { "epoch": 4.665252239509665, "grad_norm": 0.259087473154068, "learning_rate": 4.734450787813151e-05, "loss": 0.1036, "num_input_tokens_seen": 9953984, "step": 9895 }, { "epoch": 4.667609618104668, "grad_norm": 0.12912669777870178, "learning_rate": 4.7339892662366045e-05, "loss": 0.1872, "num_input_tokens_seen": 9960032, "step": 9900 }, { "epoch": 4.66996699669967, "grad_norm": 0.6939592957496643, "learning_rate": 4.733527366483605e-05, "loss": 0.1641, "num_input_tokens_seen": 9964704, "step": 9905 }, { "epoch": 4.672324375294672, "grad_norm": 0.3876437544822693, "learning_rate": 4.733065088632347e-05, "loss": 0.1336, "num_input_tokens_seen": 9970368, "step": 9910 }, { "epoch": 4.674681753889675, "grad_norm": 0.11184883117675781, "learning_rate": 4.732602432761085e-05, "loss": 0.1568, "num_input_tokens_seen": 9974880, "step": 9915 }, { "epoch": 4.677039132484677, "grad_norm": 0.42780935764312744, "learning_rate": 4.7321393989481376e-05, "loss": 0.1011, "num_input_tokens_seen": 9979936, "step": 9920 }, { "epoch": 4.67939651107968, "grad_norm": 0.24721106886863708, "learning_rate": 4.73167598727189e-05, "loss": 0.1624, "num_input_tokens_seen": 9986048, "step": 9925 }, { "epoch": 4.681753889674682, "grad_norm": 0.13474886119365692, "learning_rate": 4.7312121978107896e-05, "loss": 0.2243, "num_input_tokens_seen": 9991072, "step": 9930 }, { "epoch": 4.684111268269684, "grad_norm": 0.5385289788246155, "learning_rate": 4.7307480306433473e-05, "loss": 0.0935, "num_input_tokens_seen": 9995904, "step": 9935 }, { "epoch": 4.686468646864687, "grad_norm": 0.7615151405334473, "learning_rate": 4.7302834858481394e-05, "loss": 0.1694, "num_input_tokens_seen": 10000000, "step": 9940 }, { "epoch": 4.688826025459689, "grad_norm": 0.28284725546836853, "learning_rate": 4.7298185635038056e-05, "loss": 0.2033, "num_input_tokens_seen": 10005248, "step": 9945 }, { "epoch": 4.691183404054691, "grad_norm": 1.7686316967010498, "learning_rate": 4.7293532636890495e-05, "loss": 0.2245, "num_input_tokens_seen": 10010688, "step": 9950 }, { "epoch": 4.693540782649693, "grad_norm": 1.1319146156311035, "learning_rate": 4.728887586482638e-05, "loss": 0.1247, "num_input_tokens_seen": 10015392, "step": 9955 }, { "epoch": 4.6958981612446955, "grad_norm": 0.320870041847229, "learning_rate": 4.728421531963402e-05, "loss": 0.0482, "num_input_tokens_seen": 10020512, "step": 9960 }, { "epoch": 4.698255539839698, "grad_norm": 0.2280290275812149, "learning_rate": 4.7279551002102376e-05, "loss": 0.0635, "num_input_tokens_seen": 10025344, "step": 9965 }, { "epoch": 4.7006129184347, "grad_norm": 0.02154664695262909, "learning_rate": 4.727488291302102e-05, "loss": 0.097, "num_input_tokens_seen": 10032320, "step": 9970 }, { "epoch": 4.702970297029703, "grad_norm": 0.25050023198127747, "learning_rate": 4.727021105318021e-05, "loss": 0.1363, "num_input_tokens_seen": 10037408, "step": 9975 }, { "epoch": 4.705327675624705, "grad_norm": 0.7166488170623779, "learning_rate": 4.726553542337079e-05, "loss": 0.1572, "num_input_tokens_seen": 10041984, "step": 9980 }, { "epoch": 4.7076850542197075, "grad_norm": 1.346739411354065, "learning_rate": 4.726085602438426e-05, "loss": 0.2597, "num_input_tokens_seen": 10047296, "step": 9985 }, { "epoch": 4.71004243281471, "grad_norm": 1.8834975957870483, "learning_rate": 4.7256172857012784e-05, "loss": 0.1839, "num_input_tokens_seen": 10052608, "step": 9990 }, { "epoch": 4.712399811409712, "grad_norm": 1.8370715379714966, "learning_rate": 4.725148592204913e-05, "loss": 0.1493, "num_input_tokens_seen": 10058656, "step": 9995 }, { "epoch": 4.714757190004715, "grad_norm": 0.426844984292984, "learning_rate": 4.724679522028672e-05, "loss": 0.0758, "num_input_tokens_seen": 10063008, "step": 10000 }, { "epoch": 4.717114568599717, "grad_norm": 1.3399688005447388, "learning_rate": 4.72421007525196e-05, "loss": 0.0851, "num_input_tokens_seen": 10067008, "step": 10005 }, { "epoch": 4.7194719471947195, "grad_norm": 0.621439516544342, "learning_rate": 4.7237402519542486e-05, "loss": 0.0897, "num_input_tokens_seen": 10071360, "step": 10010 }, { "epoch": 4.721829325789722, "grad_norm": 0.8452421426773071, "learning_rate": 4.723270052215068e-05, "loss": 0.0497, "num_input_tokens_seen": 10075840, "step": 10015 }, { "epoch": 4.724186704384724, "grad_norm": 0.8381116390228271, "learning_rate": 4.7227994761140174e-05, "loss": 0.1143, "num_input_tokens_seen": 10081952, "step": 10020 }, { "epoch": 4.726544082979727, "grad_norm": 0.31455451250076294, "learning_rate": 4.722328523730756e-05, "loss": 0.2007, "num_input_tokens_seen": 10086624, "step": 10025 }, { "epoch": 4.728901461574729, "grad_norm": 0.4632142186164856, "learning_rate": 4.721857195145008e-05, "loss": 0.2366, "num_input_tokens_seen": 10090464, "step": 10030 }, { "epoch": 4.7312588401697315, "grad_norm": 0.628788411617279, "learning_rate": 4.721385490436563e-05, "loss": 0.1496, "num_input_tokens_seen": 10095008, "step": 10035 }, { "epoch": 4.733616218764734, "grad_norm": 0.6701692938804626, "learning_rate": 4.7209134096852717e-05, "loss": 0.1297, "num_input_tokens_seen": 10099712, "step": 10040 }, { "epoch": 4.735973597359736, "grad_norm": 2.0270609855651855, "learning_rate": 4.720440952971048e-05, "loss": 0.4051, "num_input_tokens_seen": 10104544, "step": 10045 }, { "epoch": 4.738330975954739, "grad_norm": 0.10696899145841599, "learning_rate": 4.719968120373873e-05, "loss": 0.1413, "num_input_tokens_seen": 10109472, "step": 10050 }, { "epoch": 4.740688354549741, "grad_norm": 0.04773787781596184, "learning_rate": 4.719494911973788e-05, "loss": 0.0896, "num_input_tokens_seen": 10114400, "step": 10055 }, { "epoch": 4.7430457331447435, "grad_norm": 1.3103276491165161, "learning_rate": 4.719021327850898e-05, "loss": 0.333, "num_input_tokens_seen": 10119008, "step": 10060 }, { "epoch": 4.745403111739745, "grad_norm": 0.30213987827301025, "learning_rate": 4.7185473680853753e-05, "loss": 0.1193, "num_input_tokens_seen": 10123584, "step": 10065 }, { "epoch": 4.7477604903347475, "grad_norm": 1.2033461332321167, "learning_rate": 4.718073032757451e-05, "loss": 0.1236, "num_input_tokens_seen": 10128512, "step": 10070 }, { "epoch": 4.75011786892975, "grad_norm": 1.7307264804840088, "learning_rate": 4.717598321947424e-05, "loss": 0.1472, "num_input_tokens_seen": 10133216, "step": 10075 }, { "epoch": 4.752475247524752, "grad_norm": 0.0697757750749588, "learning_rate": 4.7171232357356533e-05, "loss": 0.0712, "num_input_tokens_seen": 10140288, "step": 10080 }, { "epoch": 4.754832626119755, "grad_norm": 0.2464374154806137, "learning_rate": 4.716647774202563e-05, "loss": 0.1021, "num_input_tokens_seen": 10145632, "step": 10085 }, { "epoch": 4.757190004714757, "grad_norm": 0.7354953289031982, "learning_rate": 4.7161719374286415e-05, "loss": 0.3678, "num_input_tokens_seen": 10150720, "step": 10090 }, { "epoch": 4.7595473833097595, "grad_norm": 0.1878902018070221, "learning_rate": 4.7156957254944386e-05, "loss": 0.1873, "num_input_tokens_seen": 10155264, "step": 10095 }, { "epoch": 4.761904761904762, "grad_norm": 0.40319064259529114, "learning_rate": 4.7152191384805714e-05, "loss": 0.0628, "num_input_tokens_seen": 10159744, "step": 10100 }, { "epoch": 4.764262140499764, "grad_norm": 0.31370410323143005, "learning_rate": 4.714742176467715e-05, "loss": 0.0705, "num_input_tokens_seen": 10164256, "step": 10105 }, { "epoch": 4.766619519094767, "grad_norm": 0.4442927837371826, "learning_rate": 4.714264839536613e-05, "loss": 0.1826, "num_input_tokens_seen": 10169088, "step": 10110 }, { "epoch": 4.768976897689769, "grad_norm": 0.33469802141189575, "learning_rate": 4.713787127768069e-05, "loss": 0.0479, "num_input_tokens_seen": 10175008, "step": 10115 }, { "epoch": 4.7713342762847715, "grad_norm": 0.1385820209980011, "learning_rate": 4.7133090412429534e-05, "loss": 0.083, "num_input_tokens_seen": 10180032, "step": 10120 }, { "epoch": 4.773691654879774, "grad_norm": 2.1681675910949707, "learning_rate": 4.7128305800421965e-05, "loss": 0.3275, "num_input_tokens_seen": 10185600, "step": 10125 }, { "epoch": 4.776049033474776, "grad_norm": 1.347072958946228, "learning_rate": 4.712351744246794e-05, "loss": 0.1933, "num_input_tokens_seen": 10191296, "step": 10130 }, { "epoch": 4.778406412069779, "grad_norm": 0.4686971604824066, "learning_rate": 4.711872533937804e-05, "loss": 0.1252, "num_input_tokens_seen": 10196512, "step": 10135 }, { "epoch": 4.780763790664781, "grad_norm": 1.3340537548065186, "learning_rate": 4.711392949196351e-05, "loss": 0.2143, "num_input_tokens_seen": 10201888, "step": 10140 }, { "epoch": 4.7831211692597835, "grad_norm": 0.31158727407455444, "learning_rate": 4.7109129901036175e-05, "loss": 0.0635, "num_input_tokens_seen": 10207552, "step": 10145 }, { "epoch": 4.785478547854785, "grad_norm": 0.30713680386543274, "learning_rate": 4.7104326567408554e-05, "loss": 0.0897, "num_input_tokens_seen": 10213664, "step": 10150 }, { "epoch": 4.787835926449787, "grad_norm": 0.7845644950866699, "learning_rate": 4.709951949189374e-05, "loss": 0.1762, "num_input_tokens_seen": 10219360, "step": 10155 }, { "epoch": 4.79019330504479, "grad_norm": 2.0526578426361084, "learning_rate": 4.709470867530552e-05, "loss": 0.1059, "num_input_tokens_seen": 10224768, "step": 10160 }, { "epoch": 4.792550683639792, "grad_norm": 2.666383981704712, "learning_rate": 4.708989411845826e-05, "loss": 0.3333, "num_input_tokens_seen": 10230208, "step": 10165 }, { "epoch": 4.794908062234795, "grad_norm": 1.6053247451782227, "learning_rate": 4.708507582216699e-05, "loss": 0.2072, "num_input_tokens_seen": 10235424, "step": 10170 }, { "epoch": 4.797265440829797, "grad_norm": 1.5286614894866943, "learning_rate": 4.7080253787247365e-05, "loss": 0.3984, "num_input_tokens_seen": 10239488, "step": 10175 }, { "epoch": 4.799622819424799, "grad_norm": 0.34569716453552246, "learning_rate": 4.7075428014515675e-05, "loss": 0.111, "num_input_tokens_seen": 10244608, "step": 10180 }, { "epoch": 4.801980198019802, "grad_norm": 1.9898473024368286, "learning_rate": 4.707059850478884e-05, "loss": 0.1154, "num_input_tokens_seen": 10250240, "step": 10185 }, { "epoch": 4.804337576614804, "grad_norm": 0.765900731086731, "learning_rate": 4.706576525888442e-05, "loss": 0.0638, "num_input_tokens_seen": 10254784, "step": 10190 }, { "epoch": 4.806694955209807, "grad_norm": 0.6926279067993164, "learning_rate": 4.7060928277620604e-05, "loss": 0.124, "num_input_tokens_seen": 10260448, "step": 10195 }, { "epoch": 4.809052333804809, "grad_norm": 0.27886128425598145, "learning_rate": 4.70560875618162e-05, "loss": 0.1046, "num_input_tokens_seen": 10264352, "step": 10200 }, { "epoch": 4.811409712399811, "grad_norm": 1.4284722805023193, "learning_rate": 4.7051243112290655e-05, "loss": 0.2752, "num_input_tokens_seen": 10269952, "step": 10205 }, { "epoch": 4.813767090994814, "grad_norm": 0.38033244013786316, "learning_rate": 4.704639492986407e-05, "loss": 0.2275, "num_input_tokens_seen": 10275040, "step": 10210 }, { "epoch": 4.816124469589816, "grad_norm": 0.7782060503959656, "learning_rate": 4.704154301535714e-05, "loss": 0.131, "num_input_tokens_seen": 10280032, "step": 10215 }, { "epoch": 4.818481848184819, "grad_norm": 0.29435276985168457, "learning_rate": 4.7036687369591226e-05, "loss": 0.0805, "num_input_tokens_seen": 10285152, "step": 10220 }, { "epoch": 4.820839226779821, "grad_norm": 1.4142837524414062, "learning_rate": 4.70318279933883e-05, "loss": 0.2058, "num_input_tokens_seen": 10290144, "step": 10225 }, { "epoch": 4.823196605374823, "grad_norm": 1.8041224479675293, "learning_rate": 4.702696488757098e-05, "loss": 0.2629, "num_input_tokens_seen": 10295872, "step": 10230 }, { "epoch": 4.825553983969826, "grad_norm": 0.20068606734275818, "learning_rate": 4.7022098052962496e-05, "loss": 0.0803, "num_input_tokens_seen": 10300768, "step": 10235 }, { "epoch": 4.827911362564828, "grad_norm": 0.4959775507450104, "learning_rate": 4.701722749038673e-05, "loss": 0.1172, "num_input_tokens_seen": 10306080, "step": 10240 }, { "epoch": 4.830268741159831, "grad_norm": 0.04753889515995979, "learning_rate": 4.701235320066817e-05, "loss": 0.0926, "num_input_tokens_seen": 10311552, "step": 10245 }, { "epoch": 4.832626119754833, "grad_norm": 1.4865838289260864, "learning_rate": 4.700747518463197e-05, "loss": 0.2586, "num_input_tokens_seen": 10316736, "step": 10250 }, { "epoch": 4.834983498349835, "grad_norm": 0.21562214195728302, "learning_rate": 4.7002593443103884e-05, "loss": 0.0689, "num_input_tokens_seen": 10320960, "step": 10255 }, { "epoch": 4.837340876944838, "grad_norm": 0.19477300345897675, "learning_rate": 4.6997707976910316e-05, "loss": 0.1818, "num_input_tokens_seen": 10326336, "step": 10260 }, { "epoch": 4.839698255539839, "grad_norm": 0.0925021842122078, "learning_rate": 4.6992818786878284e-05, "loss": 0.1608, "num_input_tokens_seen": 10331136, "step": 10265 }, { "epoch": 4.842055634134842, "grad_norm": 0.6342712640762329, "learning_rate": 4.698792587383545e-05, "loss": 0.1454, "num_input_tokens_seen": 10336416, "step": 10270 }, { "epoch": 4.844413012729844, "grad_norm": 0.20957724750041962, "learning_rate": 4.6983029238610096e-05, "loss": 0.1745, "num_input_tokens_seen": 10340928, "step": 10275 }, { "epoch": 4.8467703913248465, "grad_norm": 1.235385775566101, "learning_rate": 4.697812888203115e-05, "loss": 0.1067, "num_input_tokens_seen": 10346080, "step": 10280 }, { "epoch": 4.849127769919849, "grad_norm": 0.13672098517417908, "learning_rate": 4.697322480492814e-05, "loss": 0.0567, "num_input_tokens_seen": 10350080, "step": 10285 }, { "epoch": 4.851485148514851, "grad_norm": 2.4923176765441895, "learning_rate": 4.696831700813126e-05, "loss": 0.1459, "num_input_tokens_seen": 10354880, "step": 10290 }, { "epoch": 4.853842527109854, "grad_norm": 0.032075077295303345, "learning_rate": 4.696340549247131e-05, "loss": 0.1394, "num_input_tokens_seen": 10359200, "step": 10295 }, { "epoch": 4.856199905704856, "grad_norm": 0.7291776537895203, "learning_rate": 4.695849025877973e-05, "loss": 0.0654, "num_input_tokens_seen": 10364032, "step": 10300 }, { "epoch": 4.858557284299859, "grad_norm": 0.057583462446928024, "learning_rate": 4.6953571307888587e-05, "loss": 0.1741, "num_input_tokens_seen": 10369792, "step": 10305 }, { "epoch": 4.860914662894861, "grad_norm": 1.1980509757995605, "learning_rate": 4.6948648640630565e-05, "loss": 0.1229, "num_input_tokens_seen": 10375264, "step": 10310 }, { "epoch": 4.863272041489863, "grad_norm": 0.30624920129776, "learning_rate": 4.6943722257839e-05, "loss": 0.1079, "num_input_tokens_seen": 10379712, "step": 10315 }, { "epoch": 4.865629420084866, "grad_norm": 1.1300283670425415, "learning_rate": 4.693879216034784e-05, "loss": 0.3622, "num_input_tokens_seen": 10385024, "step": 10320 }, { "epoch": 4.867986798679868, "grad_norm": 0.778092086315155, "learning_rate": 4.693385834899167e-05, "loss": 0.1201, "num_input_tokens_seen": 10390272, "step": 10325 }, { "epoch": 4.870344177274871, "grad_norm": 0.5056649446487427, "learning_rate": 4.692892082460568e-05, "loss": 0.1402, "num_input_tokens_seen": 10394368, "step": 10330 }, { "epoch": 4.872701555869873, "grad_norm": 1.849653959274292, "learning_rate": 4.692397958802574e-05, "loss": 0.2697, "num_input_tokens_seen": 10401056, "step": 10335 }, { "epoch": 4.875058934464875, "grad_norm": 0.15230581164360046, "learning_rate": 4.69190346400883e-05, "loss": 0.1029, "num_input_tokens_seen": 10405408, "step": 10340 }, { "epoch": 4.877416313059878, "grad_norm": 0.030304783955216408, "learning_rate": 4.6914085981630465e-05, "loss": 0.1353, "num_input_tokens_seen": 10410208, "step": 10345 }, { "epoch": 4.879773691654879, "grad_norm": 0.36812758445739746, "learning_rate": 4.690913361348995e-05, "loss": 0.1782, "num_input_tokens_seen": 10416000, "step": 10350 }, { "epoch": 4.882131070249882, "grad_norm": 0.49613213539123535, "learning_rate": 4.6904177536505107e-05, "loss": 0.1033, "num_input_tokens_seen": 10421120, "step": 10355 }, { "epoch": 4.884488448844884, "grad_norm": 2.8861820697784424, "learning_rate": 4.6899217751514925e-05, "loss": 0.1222, "num_input_tokens_seen": 10425376, "step": 10360 }, { "epoch": 4.8868458274398865, "grad_norm": 0.5158287882804871, "learning_rate": 4.689425425935899e-05, "loss": 0.0888, "num_input_tokens_seen": 10431392, "step": 10365 }, { "epoch": 4.889203206034889, "grad_norm": 0.9857426285743713, "learning_rate": 4.6889287060877575e-05, "loss": 0.1618, "num_input_tokens_seen": 10435968, "step": 10370 }, { "epoch": 4.891560584629891, "grad_norm": 0.28733277320861816, "learning_rate": 4.68843161569115e-05, "loss": 0.0736, "num_input_tokens_seen": 10440544, "step": 10375 }, { "epoch": 4.893917963224894, "grad_norm": 0.23441751301288605, "learning_rate": 4.687934154830229e-05, "loss": 0.0975, "num_input_tokens_seen": 10445408, "step": 10380 }, { "epoch": 4.896275341819896, "grad_norm": 0.08882312476634979, "learning_rate": 4.687436323589204e-05, "loss": 0.121, "num_input_tokens_seen": 10449888, "step": 10385 }, { "epoch": 4.8986327204148985, "grad_norm": 0.48919400572776794, "learning_rate": 4.686938122052349e-05, "loss": 0.1, "num_input_tokens_seen": 10455840, "step": 10390 }, { "epoch": 4.900990099009901, "grad_norm": 0.24927839636802673, "learning_rate": 4.686439550304003e-05, "loss": 0.1872, "num_input_tokens_seen": 10461280, "step": 10395 }, { "epoch": 4.903347477604903, "grad_norm": 0.6342592835426331, "learning_rate": 4.685940608428565e-05, "loss": 0.0696, "num_input_tokens_seen": 10467136, "step": 10400 }, { "epoch": 4.905704856199906, "grad_norm": 1.1032265424728394, "learning_rate": 4.6854412965104965e-05, "loss": 0.1428, "num_input_tokens_seen": 10472736, "step": 10405 }, { "epoch": 4.908062234794908, "grad_norm": 1.3314543962478638, "learning_rate": 4.684941614634324e-05, "loss": 0.1502, "num_input_tokens_seen": 10478112, "step": 10410 }, { "epoch": 4.9104196133899105, "grad_norm": 0.06671100854873657, "learning_rate": 4.6844415628846336e-05, "loss": 0.034, "num_input_tokens_seen": 10483456, "step": 10415 }, { "epoch": 4.912776991984913, "grad_norm": 0.6536691784858704, "learning_rate": 4.6839411413460764e-05, "loss": 0.2239, "num_input_tokens_seen": 10488704, "step": 10420 }, { "epoch": 4.915134370579915, "grad_norm": 0.40673428773880005, "learning_rate": 4.683440350103365e-05, "loss": 0.2243, "num_input_tokens_seen": 10492960, "step": 10425 }, { "epoch": 4.917491749174918, "grad_norm": 3.4042158126831055, "learning_rate": 4.6829391892412756e-05, "loss": 0.2653, "num_input_tokens_seen": 10498592, "step": 10430 }, { "epoch": 4.91984912776992, "grad_norm": 0.9606775045394897, "learning_rate": 4.682437658844645e-05, "loss": 0.1943, "num_input_tokens_seen": 10503232, "step": 10435 }, { "epoch": 4.9222065063649225, "grad_norm": 0.291923850774765, "learning_rate": 4.681935758998375e-05, "loss": 0.2076, "num_input_tokens_seen": 10508448, "step": 10440 }, { "epoch": 4.924563884959925, "grad_norm": 0.05164707452058792, "learning_rate": 4.681433489787428e-05, "loss": 0.12, "num_input_tokens_seen": 10513184, "step": 10445 }, { "epoch": 4.926921263554927, "grad_norm": 0.23904457688331604, "learning_rate": 4.68093085129683e-05, "loss": 0.1323, "num_input_tokens_seen": 10518208, "step": 10450 }, { "epoch": 4.92927864214993, "grad_norm": 0.11669597774744034, "learning_rate": 4.680427843611668e-05, "loss": 0.0836, "num_input_tokens_seen": 10522912, "step": 10455 }, { "epoch": 4.931636020744932, "grad_norm": 1.7531530857086182, "learning_rate": 4.679924466817095e-05, "loss": 0.1288, "num_input_tokens_seen": 10527200, "step": 10460 }, { "epoch": 4.933993399339934, "grad_norm": 1.8566255569458008, "learning_rate": 4.679420720998322e-05, "loss": 0.1696, "num_input_tokens_seen": 10532448, "step": 10465 }, { "epoch": 4.936350777934936, "grad_norm": 1.764092206954956, "learning_rate": 4.678916606240625e-05, "loss": 0.2527, "num_input_tokens_seen": 10537536, "step": 10470 }, { "epoch": 4.938708156529938, "grad_norm": 1.3695682287216187, "learning_rate": 4.678412122629343e-05, "loss": 0.1203, "num_input_tokens_seen": 10542784, "step": 10475 }, { "epoch": 4.941065535124941, "grad_norm": 1.6533650159835815, "learning_rate": 4.677907270249876e-05, "loss": 0.1433, "num_input_tokens_seen": 10547616, "step": 10480 }, { "epoch": 4.943422913719943, "grad_norm": 0.23132377862930298, "learning_rate": 4.677402049187687e-05, "loss": 0.2529, "num_input_tokens_seen": 10553952, "step": 10485 }, { "epoch": 4.945780292314946, "grad_norm": 0.6334948539733887, "learning_rate": 4.676896459528301e-05, "loss": 0.2194, "num_input_tokens_seen": 10560160, "step": 10490 }, { "epoch": 4.948137670909948, "grad_norm": 0.32440146803855896, "learning_rate": 4.676390501357306e-05, "loss": 0.0489, "num_input_tokens_seen": 10565760, "step": 10495 }, { "epoch": 4.9504950495049505, "grad_norm": 0.3285432755947113, "learning_rate": 4.675884174760353e-05, "loss": 0.0616, "num_input_tokens_seen": 10572256, "step": 10500 }, { "epoch": 4.952852428099953, "grad_norm": 0.3206178843975067, "learning_rate": 4.675377479823153e-05, "loss": 0.1627, "num_input_tokens_seen": 10576544, "step": 10505 }, { "epoch": 4.955209806694955, "grad_norm": 0.31731143593788147, "learning_rate": 4.674870416631481e-05, "loss": 0.1544, "num_input_tokens_seen": 10581632, "step": 10510 }, { "epoch": 4.957567185289958, "grad_norm": 0.7950935363769531, "learning_rate": 4.674362985271175e-05, "loss": 0.1515, "num_input_tokens_seen": 10586240, "step": 10515 }, { "epoch": 4.95992456388496, "grad_norm": 0.7101511359214783, "learning_rate": 4.673855185828134e-05, "loss": 0.0892, "num_input_tokens_seen": 10592000, "step": 10520 }, { "epoch": 4.9622819424799625, "grad_norm": 0.6412246227264404, "learning_rate": 4.67334701838832e-05, "loss": 0.0876, "num_input_tokens_seen": 10597056, "step": 10525 }, { "epoch": 4.964639321074965, "grad_norm": 4.097841262817383, "learning_rate": 4.6728384830377584e-05, "loss": 0.2353, "num_input_tokens_seen": 10601568, "step": 10530 }, { "epoch": 4.966996699669967, "grad_norm": 0.5393060445785522, "learning_rate": 4.6723295798625326e-05, "loss": 0.0303, "num_input_tokens_seen": 10606880, "step": 10535 }, { "epoch": 4.96935407826497, "grad_norm": 0.0393301360309124, "learning_rate": 4.671820308948793e-05, "loss": 0.1345, "num_input_tokens_seen": 10611392, "step": 10540 }, { "epoch": 4.971711456859972, "grad_norm": 0.5064797401428223, "learning_rate": 4.6713106703827514e-05, "loss": 0.1678, "num_input_tokens_seen": 10615968, "step": 10545 }, { "epoch": 4.974068835454974, "grad_norm": 0.8611802458763123, "learning_rate": 4.6708006642506804e-05, "loss": 0.12, "num_input_tokens_seen": 10621120, "step": 10550 }, { "epoch": 4.976426214049976, "grad_norm": 0.5334407091140747, "learning_rate": 4.670290290638915e-05, "loss": 0.2389, "num_input_tokens_seen": 10625504, "step": 10555 }, { "epoch": 4.978783592644978, "grad_norm": 0.9863454103469849, "learning_rate": 4.669779549633852e-05, "loss": 0.1237, "num_input_tokens_seen": 10630560, "step": 10560 }, { "epoch": 4.981140971239981, "grad_norm": 1.3992854356765747, "learning_rate": 4.669268441321952e-05, "loss": 0.1282, "num_input_tokens_seen": 10635872, "step": 10565 }, { "epoch": 4.983498349834983, "grad_norm": 0.14068850874900818, "learning_rate": 4.6687569657897367e-05, "loss": 0.2436, "num_input_tokens_seen": 10641184, "step": 10570 }, { "epoch": 4.985855728429986, "grad_norm": 1.03408682346344, "learning_rate": 4.668245123123791e-05, "loss": 0.1477, "num_input_tokens_seen": 10646304, "step": 10575 }, { "epoch": 4.988213107024988, "grad_norm": 0.7006295919418335, "learning_rate": 4.66773291341076e-05, "loss": 0.1703, "num_input_tokens_seen": 10651072, "step": 10580 }, { "epoch": 4.99057048561999, "grad_norm": 1.0453462600708008, "learning_rate": 4.667220336737354e-05, "loss": 0.1773, "num_input_tokens_seen": 10656864, "step": 10585 }, { "epoch": 4.992927864214993, "grad_norm": 0.709644615650177, "learning_rate": 4.666707393190342e-05, "loss": 0.2384, "num_input_tokens_seen": 10662112, "step": 10590 }, { "epoch": 4.995285242809995, "grad_norm": 0.2111901193857193, "learning_rate": 4.6661940828565566e-05, "loss": 0.1186, "num_input_tokens_seen": 10666560, "step": 10595 }, { "epoch": 4.997642621404998, "grad_norm": 0.10810636729001999, "learning_rate": 4.665680405822893e-05, "loss": 0.088, "num_input_tokens_seen": 10672032, "step": 10600 }, { "epoch": 5.0, "grad_norm": 0.19989317655563354, "learning_rate": 4.6651663621763076e-05, "loss": 0.1825, "num_input_tokens_seen": 10677088, "step": 10605 }, { "epoch": 5.0, "eval_loss": 0.15683941543102264, "eval_runtime": 15.1496, "eval_samples_per_second": 62.246, "eval_steps_per_second": 15.578, "num_input_tokens_seen": 10677088, "step": 10605 }, { "epoch": 5.002357378595002, "grad_norm": 0.8537131547927856, "learning_rate": 4.66465195200382e-05, "loss": 0.0611, "num_input_tokens_seen": 10682528, "step": 10610 }, { "epoch": 5.004714757190005, "grad_norm": 2.31187105178833, "learning_rate": 4.66413717539251e-05, "loss": 0.1325, "num_input_tokens_seen": 10687872, "step": 10615 }, { "epoch": 5.007072135785007, "grad_norm": 1.2313032150268555, "learning_rate": 4.6636220324295224e-05, "loss": 0.3281, "num_input_tokens_seen": 10692224, "step": 10620 }, { "epoch": 5.00942951438001, "grad_norm": 0.7299053072929382, "learning_rate": 4.663106523202059e-05, "loss": 0.1207, "num_input_tokens_seen": 10697248, "step": 10625 }, { "epoch": 5.011786892975012, "grad_norm": 0.846189558506012, "learning_rate": 4.66259064779739e-05, "loss": 0.3474, "num_input_tokens_seen": 10702496, "step": 10630 }, { "epoch": 5.014144271570014, "grad_norm": 0.9183874726295471, "learning_rate": 4.662074406302843e-05, "loss": 0.1178, "num_input_tokens_seen": 10706272, "step": 10635 }, { "epoch": 5.016501650165017, "grad_norm": 0.12331566959619522, "learning_rate": 4.6615577988058083e-05, "loss": 0.1943, "num_input_tokens_seen": 10710528, "step": 10640 }, { "epoch": 5.018859028760019, "grad_norm": 0.548247754573822, "learning_rate": 4.66104082539374e-05, "loss": 0.0978, "num_input_tokens_seen": 10715488, "step": 10645 }, { "epoch": 5.021216407355022, "grad_norm": 0.36443963646888733, "learning_rate": 4.660523486154152e-05, "loss": 0.0623, "num_input_tokens_seen": 10720576, "step": 10650 }, { "epoch": 5.023573785950024, "grad_norm": 1.2999712228775024, "learning_rate": 4.6600057811746216e-05, "loss": 0.1521, "num_input_tokens_seen": 10726304, "step": 10655 }, { "epoch": 5.0259311645450255, "grad_norm": 0.268777996301651, "learning_rate": 4.659487710542787e-05, "loss": 0.0508, "num_input_tokens_seen": 10732576, "step": 10660 }, { "epoch": 5.028288543140028, "grad_norm": 1.3247787952423096, "learning_rate": 4.6589692743463495e-05, "loss": 0.2284, "num_input_tokens_seen": 10737632, "step": 10665 }, { "epoch": 5.03064592173503, "grad_norm": 1.1598708629608154, "learning_rate": 4.6584504726730706e-05, "loss": 0.1342, "num_input_tokens_seen": 10741856, "step": 10670 }, { "epoch": 5.033003300330033, "grad_norm": 0.2421996295452118, "learning_rate": 4.657931305610775e-05, "loss": 0.1037, "num_input_tokens_seen": 10746208, "step": 10675 }, { "epoch": 5.035360678925035, "grad_norm": 0.24647459387779236, "learning_rate": 4.65741177324735e-05, "loss": 0.3268, "num_input_tokens_seen": 10750944, "step": 10680 }, { "epoch": 5.0377180575200375, "grad_norm": 1.143604040145874, "learning_rate": 4.656891875670742e-05, "loss": 0.2841, "num_input_tokens_seen": 10755200, "step": 10685 }, { "epoch": 5.04007543611504, "grad_norm": 0.2808510363101959, "learning_rate": 4.656371612968961e-05, "loss": 0.096, "num_input_tokens_seen": 10759392, "step": 10690 }, { "epoch": 5.042432814710042, "grad_norm": 1.3598082065582275, "learning_rate": 4.6558509852300784e-05, "loss": 0.1225, "num_input_tokens_seen": 10765216, "step": 10695 }, { "epoch": 5.044790193305045, "grad_norm": 0.3951672911643982, "learning_rate": 4.6553299925422287e-05, "loss": 0.209, "num_input_tokens_seen": 10771104, "step": 10700 }, { "epoch": 5.047147571900047, "grad_norm": 1.585639238357544, "learning_rate": 4.6548086349936074e-05, "loss": 0.1287, "num_input_tokens_seen": 10776064, "step": 10705 }, { "epoch": 5.0495049504950495, "grad_norm": 0.2850760519504547, "learning_rate": 4.654286912672471e-05, "loss": 0.0927, "num_input_tokens_seen": 10780960, "step": 10710 }, { "epoch": 5.051862329090052, "grad_norm": 0.1201963871717453, "learning_rate": 4.653764825667137e-05, "loss": 0.1146, "num_input_tokens_seen": 10785760, "step": 10715 }, { "epoch": 5.054219707685054, "grad_norm": 0.6280065178871155, "learning_rate": 4.653242374065987e-05, "loss": 0.2415, "num_input_tokens_seen": 10790400, "step": 10720 }, { "epoch": 5.056577086280057, "grad_norm": 1.2999396324157715, "learning_rate": 4.6527195579574634e-05, "loss": 0.1523, "num_input_tokens_seen": 10795200, "step": 10725 }, { "epoch": 5.058934464875059, "grad_norm": 1.271591305732727, "learning_rate": 4.65219637743007e-05, "loss": 0.1656, "num_input_tokens_seen": 10800000, "step": 10730 }, { "epoch": 5.061291843470062, "grad_norm": 0.9434502720832825, "learning_rate": 4.651672832572371e-05, "loss": 0.1258, "num_input_tokens_seen": 10804000, "step": 10735 }, { "epoch": 5.063649222065064, "grad_norm": 0.2201591581106186, "learning_rate": 4.651148923472995e-05, "loss": 0.1235, "num_input_tokens_seen": 10808896, "step": 10740 }, { "epoch": 5.066006600660066, "grad_norm": 0.6406424641609192, "learning_rate": 4.650624650220632e-05, "loss": 0.1118, "num_input_tokens_seen": 10813664, "step": 10745 }, { "epoch": 5.068363979255069, "grad_norm": 0.07971697300672531, "learning_rate": 4.650100012904031e-05, "loss": 0.055, "num_input_tokens_seen": 10819264, "step": 10750 }, { "epoch": 5.07072135785007, "grad_norm": 0.2896900177001953, "learning_rate": 4.649575011612004e-05, "loss": 0.0912, "num_input_tokens_seen": 10824064, "step": 10755 }, { "epoch": 5.073078736445073, "grad_norm": 0.8651484251022339, "learning_rate": 4.6490496464334247e-05, "loss": 0.1165, "num_input_tokens_seen": 10829216, "step": 10760 }, { "epoch": 5.075436115040075, "grad_norm": 2.8779261112213135, "learning_rate": 4.648523917457229e-05, "loss": 0.2117, "num_input_tokens_seen": 10835264, "step": 10765 }, { "epoch": 5.0777934936350775, "grad_norm": 0.22148101031780243, "learning_rate": 4.6479978247724145e-05, "loss": 0.0862, "num_input_tokens_seen": 10840384, "step": 10770 }, { "epoch": 5.08015087223008, "grad_norm": 0.800041139125824, "learning_rate": 4.6474713684680394e-05, "loss": 0.124, "num_input_tokens_seen": 10845632, "step": 10775 }, { "epoch": 5.082508250825082, "grad_norm": 1.3512694835662842, "learning_rate": 4.646944548633223e-05, "loss": 0.2148, "num_input_tokens_seen": 10850560, "step": 10780 }, { "epoch": 5.084865629420085, "grad_norm": 0.7003898024559021, "learning_rate": 4.646417365357148e-05, "loss": 0.114, "num_input_tokens_seen": 10855360, "step": 10785 }, { "epoch": 5.087223008015087, "grad_norm": 0.8825246691703796, "learning_rate": 4.645889818729057e-05, "loss": 0.1399, "num_input_tokens_seen": 10862336, "step": 10790 }, { "epoch": 5.0895803866100895, "grad_norm": 1.921477198600769, "learning_rate": 4.645361908838255e-05, "loss": 0.1558, "num_input_tokens_seen": 10867264, "step": 10795 }, { "epoch": 5.091937765205092, "grad_norm": 1.0740636587142944, "learning_rate": 4.644833635774107e-05, "loss": 0.1549, "num_input_tokens_seen": 10872384, "step": 10800 }, { "epoch": 5.094295143800094, "grad_norm": 0.11709905415773392, "learning_rate": 4.644304999626044e-05, "loss": 0.0768, "num_input_tokens_seen": 10877760, "step": 10805 }, { "epoch": 5.096652522395097, "grad_norm": 0.24041831493377686, "learning_rate": 4.643776000483551e-05, "loss": 0.0818, "num_input_tokens_seen": 10882816, "step": 10810 }, { "epoch": 5.099009900990099, "grad_norm": 0.12956281006336212, "learning_rate": 4.64324663843618e-05, "loss": 0.0421, "num_input_tokens_seen": 10887488, "step": 10815 }, { "epoch": 5.1013672795851015, "grad_norm": 0.17207388579845428, "learning_rate": 4.642716913573544e-05, "loss": 0.0946, "num_input_tokens_seen": 10892096, "step": 10820 }, { "epoch": 5.103724658180104, "grad_norm": 0.2495039403438568, "learning_rate": 4.6421868259853156e-05, "loss": 0.1942, "num_input_tokens_seen": 10897952, "step": 10825 }, { "epoch": 5.106082036775106, "grad_norm": 0.3328884243965149, "learning_rate": 4.64165637576123e-05, "loss": 0.1521, "num_input_tokens_seen": 10903264, "step": 10830 }, { "epoch": 5.108439415370109, "grad_norm": 1.1019275188446045, "learning_rate": 4.6411255629910835e-05, "loss": 0.1201, "num_input_tokens_seen": 10908704, "step": 10835 }, { "epoch": 5.110796793965111, "grad_norm": 0.6458044648170471, "learning_rate": 4.640594387764733e-05, "loss": 0.1751, "num_input_tokens_seen": 10912928, "step": 10840 }, { "epoch": 5.1131541725601135, "grad_norm": 0.8266037702560425, "learning_rate": 4.640062850172098e-05, "loss": 0.0582, "num_input_tokens_seen": 10918560, "step": 10845 }, { "epoch": 5.115511551155116, "grad_norm": 1.508684754371643, "learning_rate": 4.6395309503031594e-05, "loss": 0.1819, "num_input_tokens_seen": 10922656, "step": 10850 }, { "epoch": 5.117868929750118, "grad_norm": 0.12768618762493134, "learning_rate": 4.638998688247957e-05, "loss": 0.056, "num_input_tokens_seen": 10927232, "step": 10855 }, { "epoch": 5.12022630834512, "grad_norm": 1.3797861337661743, "learning_rate": 4.6384660640965946e-05, "loss": 0.2517, "num_input_tokens_seen": 10932832, "step": 10860 }, { "epoch": 5.122583686940122, "grad_norm": 0.7216066718101501, "learning_rate": 4.637933077939238e-05, "loss": 0.1627, "num_input_tokens_seen": 10937888, "step": 10865 }, { "epoch": 5.124941065535125, "grad_norm": 0.25103312730789185, "learning_rate": 4.6373997298661105e-05, "loss": 0.1153, "num_input_tokens_seen": 10942720, "step": 10870 }, { "epoch": 5.127298444130127, "grad_norm": 0.980595052242279, "learning_rate": 4.636866019967499e-05, "loss": 0.1888, "num_input_tokens_seen": 10948288, "step": 10875 }, { "epoch": 5.129655822725129, "grad_norm": 0.23183315992355347, "learning_rate": 4.6363319483337534e-05, "loss": 0.1346, "num_input_tokens_seen": 10953152, "step": 10880 }, { "epoch": 5.132013201320132, "grad_norm": 1.4558547735214233, "learning_rate": 4.635797515055282e-05, "loss": 0.1022, "num_input_tokens_seen": 10958368, "step": 10885 }, { "epoch": 5.134370579915134, "grad_norm": 1.348642110824585, "learning_rate": 4.635262720222554e-05, "loss": 0.25, "num_input_tokens_seen": 10963968, "step": 10890 }, { "epoch": 5.136727958510137, "grad_norm": 0.12370149046182632, "learning_rate": 4.634727563926103e-05, "loss": 0.167, "num_input_tokens_seen": 10968288, "step": 10895 }, { "epoch": 5.139085337105139, "grad_norm": 0.021181942895054817, "learning_rate": 4.6341920462565206e-05, "loss": 0.1596, "num_input_tokens_seen": 10973888, "step": 10900 }, { "epoch": 5.141442715700141, "grad_norm": 0.6945145726203918, "learning_rate": 4.6336561673044614e-05, "loss": 0.0548, "num_input_tokens_seen": 10981184, "step": 10905 }, { "epoch": 5.143800094295144, "grad_norm": 0.8096641898155212, "learning_rate": 4.6331199271606405e-05, "loss": 0.1813, "num_input_tokens_seen": 10986304, "step": 10910 }, { "epoch": 5.146157472890146, "grad_norm": 0.07081890851259232, "learning_rate": 4.632583325915835e-05, "loss": 0.0719, "num_input_tokens_seen": 10993312, "step": 10915 }, { "epoch": 5.148514851485149, "grad_norm": 1.3015133142471313, "learning_rate": 4.6320463636608804e-05, "loss": 0.1945, "num_input_tokens_seen": 10997120, "step": 10920 }, { "epoch": 5.150872230080151, "grad_norm": 0.013222269713878632, "learning_rate": 4.6315090404866766e-05, "loss": 0.0781, "num_input_tokens_seen": 11002592, "step": 10925 }, { "epoch": 5.1532296086751535, "grad_norm": 1.3394936323165894, "learning_rate": 4.630971356484184e-05, "loss": 0.2359, "num_input_tokens_seen": 11007040, "step": 10930 }, { "epoch": 5.155586987270156, "grad_norm": 2.033195972442627, "learning_rate": 4.630433311744422e-05, "loss": 0.2308, "num_input_tokens_seen": 11011776, "step": 10935 }, { "epoch": 5.157944365865158, "grad_norm": 1.4846880435943604, "learning_rate": 4.6298949063584736e-05, "loss": 0.153, "num_input_tokens_seen": 11015840, "step": 10940 }, { "epoch": 5.160301744460161, "grad_norm": 0.47351518273353577, "learning_rate": 4.62935614041748e-05, "loss": 0.1128, "num_input_tokens_seen": 11021088, "step": 10945 }, { "epoch": 5.162659123055163, "grad_norm": 0.24860882759094238, "learning_rate": 4.6288170140126476e-05, "loss": 0.1626, "num_input_tokens_seen": 11026208, "step": 10950 }, { "epoch": 5.165016501650165, "grad_norm": 0.1944998949766159, "learning_rate": 4.6282775272352394e-05, "loss": 0.0525, "num_input_tokens_seen": 11031104, "step": 10955 }, { "epoch": 5.167373880245167, "grad_norm": 0.4810832738876343, "learning_rate": 4.627737680176581e-05, "loss": 0.0547, "num_input_tokens_seen": 11036032, "step": 10960 }, { "epoch": 5.169731258840169, "grad_norm": 0.30216968059539795, "learning_rate": 4.627197472928062e-05, "loss": 0.0304, "num_input_tokens_seen": 11040544, "step": 10965 }, { "epoch": 5.172088637435172, "grad_norm": 0.9871057271957397, "learning_rate": 4.6266569055811276e-05, "loss": 0.2707, "num_input_tokens_seen": 11046592, "step": 10970 }, { "epoch": 5.174446016030174, "grad_norm": 0.3187277019023895, "learning_rate": 4.626115978227288e-05, "loss": 0.0948, "num_input_tokens_seen": 11051424, "step": 10975 }, { "epoch": 5.176803394625177, "grad_norm": 0.07667816430330276, "learning_rate": 4.6255746909581124e-05, "loss": 0.121, "num_input_tokens_seen": 11056992, "step": 10980 }, { "epoch": 5.179160773220179, "grad_norm": 0.5391067266464233, "learning_rate": 4.625033043865232e-05, "loss": 0.1895, "num_input_tokens_seen": 11063520, "step": 10985 }, { "epoch": 5.181518151815181, "grad_norm": 0.3581492602825165, "learning_rate": 4.6244910370403383e-05, "loss": 0.242, "num_input_tokens_seen": 11068256, "step": 10990 }, { "epoch": 5.183875530410184, "grad_norm": 0.6542142629623413, "learning_rate": 4.6239486705751834e-05, "loss": 0.1875, "num_input_tokens_seen": 11074048, "step": 10995 }, { "epoch": 5.186232909005186, "grad_norm": 0.7289865016937256, "learning_rate": 4.6234059445615813e-05, "loss": 0.1498, "num_input_tokens_seen": 11078560, "step": 11000 }, { "epoch": 5.188590287600189, "grad_norm": 0.14359885454177856, "learning_rate": 4.622862859091407e-05, "loss": 0.0478, "num_input_tokens_seen": 11083488, "step": 11005 }, { "epoch": 5.190947666195191, "grad_norm": 1.1671756505966187, "learning_rate": 4.6223194142565937e-05, "loss": 0.0917, "num_input_tokens_seen": 11087872, "step": 11010 }, { "epoch": 5.193305044790193, "grad_norm": 0.5512914657592773, "learning_rate": 4.6217756101491396e-05, "loss": 0.0801, "num_input_tokens_seen": 11094144, "step": 11015 }, { "epoch": 5.195662423385196, "grad_norm": 0.03139237314462662, "learning_rate": 4.621231446861099e-05, "loss": 0.137, "num_input_tokens_seen": 11098976, "step": 11020 }, { "epoch": 5.198019801980198, "grad_norm": 0.7948375344276428, "learning_rate": 4.620686924484592e-05, "loss": 0.1522, "num_input_tokens_seen": 11104960, "step": 11025 }, { "epoch": 5.200377180575201, "grad_norm": 0.6930983662605286, "learning_rate": 4.620142043111795e-05, "loss": 0.2599, "num_input_tokens_seen": 11110048, "step": 11030 }, { "epoch": 5.202734559170203, "grad_norm": 1.3300224542617798, "learning_rate": 4.619596802834949e-05, "loss": 0.2245, "num_input_tokens_seen": 11115712, "step": 11035 }, { "epoch": 5.205091937765205, "grad_norm": 0.6522700190544128, "learning_rate": 4.619051203746353e-05, "loss": 0.2114, "num_input_tokens_seen": 11121344, "step": 11040 }, { "epoch": 5.207449316360208, "grad_norm": 0.468391478061676, "learning_rate": 4.618505245938367e-05, "loss": 0.0873, "num_input_tokens_seen": 11127232, "step": 11045 }, { "epoch": 5.20980669495521, "grad_norm": 1.0925579071044922, "learning_rate": 4.617958929503413e-05, "loss": 0.2123, "num_input_tokens_seen": 11132832, "step": 11050 }, { "epoch": 5.212164073550213, "grad_norm": 0.24863742291927338, "learning_rate": 4.6174122545339736e-05, "loss": 0.2119, "num_input_tokens_seen": 11138752, "step": 11055 }, { "epoch": 5.214521452145214, "grad_norm": 0.7672008275985718, "learning_rate": 4.616865221122591e-05, "loss": 0.1534, "num_input_tokens_seen": 11143904, "step": 11060 }, { "epoch": 5.2168788307402165, "grad_norm": 0.24848920106887817, "learning_rate": 4.616317829361869e-05, "loss": 0.0826, "num_input_tokens_seen": 11148960, "step": 11065 }, { "epoch": 5.219236209335219, "grad_norm": 0.28142306208610535, "learning_rate": 4.615770079344472e-05, "loss": 0.0768, "num_input_tokens_seen": 11154816, "step": 11070 }, { "epoch": 5.221593587930221, "grad_norm": 0.4994557797908783, "learning_rate": 4.615221971163123e-05, "loss": 0.1898, "num_input_tokens_seen": 11159072, "step": 11075 }, { "epoch": 5.223950966525224, "grad_norm": 0.14586272835731506, "learning_rate": 4.614673504910609e-05, "loss": 0.0478, "num_input_tokens_seen": 11162880, "step": 11080 }, { "epoch": 5.226308345120226, "grad_norm": 0.41448554396629333, "learning_rate": 4.6141246806797754e-05, "loss": 0.0942, "num_input_tokens_seen": 11167264, "step": 11085 }, { "epoch": 5.2286657237152285, "grad_norm": 0.9132392406463623, "learning_rate": 4.613575498563531e-05, "loss": 0.0648, "num_input_tokens_seen": 11171616, "step": 11090 }, { "epoch": 5.231023102310231, "grad_norm": 0.6120309233665466, "learning_rate": 4.613025958654839e-05, "loss": 0.1444, "num_input_tokens_seen": 11176992, "step": 11095 }, { "epoch": 5.233380480905233, "grad_norm": 0.2718588709831238, "learning_rate": 4.6124760610467304e-05, "loss": 0.0926, "num_input_tokens_seen": 11181248, "step": 11100 }, { "epoch": 5.235737859500236, "grad_norm": 1.2956113815307617, "learning_rate": 4.6119258058322924e-05, "loss": 0.2565, "num_input_tokens_seen": 11187232, "step": 11105 }, { "epoch": 5.238095238095238, "grad_norm": 0.4197085499763489, "learning_rate": 4.611375193104674e-05, "loss": 0.067, "num_input_tokens_seen": 11193024, "step": 11110 }, { "epoch": 5.2404526166902405, "grad_norm": 1.648444652557373, "learning_rate": 4.610824222957084e-05, "loss": 0.095, "num_input_tokens_seen": 11198432, "step": 11115 }, { "epoch": 5.242809995285243, "grad_norm": 0.26679739356040955, "learning_rate": 4.610272895482794e-05, "loss": 0.1786, "num_input_tokens_seen": 11202976, "step": 11120 }, { "epoch": 5.245167373880245, "grad_norm": 0.4951089322566986, "learning_rate": 4.609721210775132e-05, "loss": 0.0897, "num_input_tokens_seen": 11207488, "step": 11125 }, { "epoch": 5.247524752475248, "grad_norm": 0.704301118850708, "learning_rate": 4.609169168927491e-05, "loss": 0.1996, "num_input_tokens_seen": 11213344, "step": 11130 }, { "epoch": 5.24988213107025, "grad_norm": 3.1971678733825684, "learning_rate": 4.60861677003332e-05, "loss": 0.1571, "num_input_tokens_seen": 11217024, "step": 11135 }, { "epoch": 5.2522395096652525, "grad_norm": 0.17967469990253448, "learning_rate": 4.608064014186133e-05, "loss": 0.0638, "num_input_tokens_seen": 11223008, "step": 11140 }, { "epoch": 5.254596888260255, "grad_norm": 0.6947186589241028, "learning_rate": 4.6075109014795016e-05, "loss": 0.0549, "num_input_tokens_seen": 11227712, "step": 11145 }, { "epoch": 5.256954266855257, "grad_norm": 0.6079621911048889, "learning_rate": 4.6069574320070575e-05, "loss": 0.0561, "num_input_tokens_seen": 11233536, "step": 11150 }, { "epoch": 5.259311645450259, "grad_norm": 1.829858422279358, "learning_rate": 4.6064036058624936e-05, "loss": 0.289, "num_input_tokens_seen": 11239680, "step": 11155 }, { "epoch": 5.261669024045261, "grad_norm": 2.031153917312622, "learning_rate": 4.6058494231395655e-05, "loss": 0.1295, "num_input_tokens_seen": 11244544, "step": 11160 }, { "epoch": 5.264026402640264, "grad_norm": 2.3303568363189697, "learning_rate": 4.605294883932084e-05, "loss": 0.3552, "num_input_tokens_seen": 11249120, "step": 11165 }, { "epoch": 5.266383781235266, "grad_norm": 0.4812454581260681, "learning_rate": 4.604739988333925e-05, "loss": 0.2479, "num_input_tokens_seen": 11254976, "step": 11170 }, { "epoch": 5.2687411598302685, "grad_norm": 2.3967347145080566, "learning_rate": 4.604184736439023e-05, "loss": 0.2479, "num_input_tokens_seen": 11258816, "step": 11175 }, { "epoch": 5.271098538425271, "grad_norm": 0.7251073718070984, "learning_rate": 4.6036291283413714e-05, "loss": 0.2055, "num_input_tokens_seen": 11263520, "step": 11180 }, { "epoch": 5.273455917020273, "grad_norm": 0.8027268648147583, "learning_rate": 4.6030731641350265e-05, "loss": 0.2362, "num_input_tokens_seen": 11268512, "step": 11185 }, { "epoch": 5.275813295615276, "grad_norm": 1.5485020875930786, "learning_rate": 4.602516843914103e-05, "loss": 0.1905, "num_input_tokens_seen": 11272960, "step": 11190 }, { "epoch": 5.278170674210278, "grad_norm": 0.2774328291416168, "learning_rate": 4.601960167772776e-05, "loss": 0.0226, "num_input_tokens_seen": 11276992, "step": 11195 }, { "epoch": 5.2805280528052805, "grad_norm": 0.06841090321540833, "learning_rate": 4.601403135805282e-05, "loss": 0.2681, "num_input_tokens_seen": 11281568, "step": 11200 }, { "epoch": 5.282885431400283, "grad_norm": 1.408042550086975, "learning_rate": 4.6008457481059177e-05, "loss": 0.0947, "num_input_tokens_seen": 11285952, "step": 11205 }, { "epoch": 5.285242809995285, "grad_norm": 0.1764611303806305, "learning_rate": 4.600288004769038e-05, "loss": 0.1465, "num_input_tokens_seen": 11291552, "step": 11210 }, { "epoch": 5.287600188590288, "grad_norm": 0.14456653594970703, "learning_rate": 4.5997299058890594e-05, "loss": 0.065, "num_input_tokens_seen": 11296576, "step": 11215 }, { "epoch": 5.28995756718529, "grad_norm": 0.2803828716278076, "learning_rate": 4.59917145156046e-05, "loss": 0.0951, "num_input_tokens_seen": 11302240, "step": 11220 }, { "epoch": 5.2923149457802925, "grad_norm": 0.30400124192237854, "learning_rate": 4.598612641877775e-05, "loss": 0.136, "num_input_tokens_seen": 11307232, "step": 11225 }, { "epoch": 5.294672324375295, "grad_norm": 2.384932279586792, "learning_rate": 4.5980534769356026e-05, "loss": 0.1246, "num_input_tokens_seen": 11312160, "step": 11230 }, { "epoch": 5.297029702970297, "grad_norm": 0.6485844850540161, "learning_rate": 4.5974939568286e-05, "loss": 0.1154, "num_input_tokens_seen": 11316960, "step": 11235 }, { "epoch": 5.2993870815653, "grad_norm": 0.11977234482765198, "learning_rate": 4.596934081651483e-05, "loss": 0.1382, "num_input_tokens_seen": 11323232, "step": 11240 }, { "epoch": 5.301744460160302, "grad_norm": 1.3672133684158325, "learning_rate": 4.59637385149903e-05, "loss": 0.1139, "num_input_tokens_seen": 11327520, "step": 11245 }, { "epoch": 5.3041018387553045, "grad_norm": 0.06185010448098183, "learning_rate": 4.595813266466079e-05, "loss": 0.0561, "num_input_tokens_seen": 11333536, "step": 11250 }, { "epoch": 5.306459217350307, "grad_norm": 0.483965665102005, "learning_rate": 4.5952523266475265e-05, "loss": 0.0417, "num_input_tokens_seen": 11339072, "step": 11255 }, { "epoch": 5.308816595945308, "grad_norm": 0.4311487078666687, "learning_rate": 4.5946910321383294e-05, "loss": 0.1459, "num_input_tokens_seen": 11343520, "step": 11260 }, { "epoch": 5.311173974540311, "grad_norm": 1.1620714664459229, "learning_rate": 4.594129383033508e-05, "loss": 0.1259, "num_input_tokens_seen": 11348000, "step": 11265 }, { "epoch": 5.313531353135313, "grad_norm": 0.17637138068675995, "learning_rate": 4.5935673794281376e-05, "loss": 0.1222, "num_input_tokens_seen": 11352928, "step": 11270 }, { "epoch": 5.315888731730316, "grad_norm": 0.022205254063010216, "learning_rate": 4.593005021417357e-05, "loss": 0.07, "num_input_tokens_seen": 11358528, "step": 11275 }, { "epoch": 5.318246110325318, "grad_norm": 0.9475031495094299, "learning_rate": 4.5924423090963634e-05, "loss": 0.321, "num_input_tokens_seen": 11362496, "step": 11280 }, { "epoch": 5.32060348892032, "grad_norm": 0.6686188578605652, "learning_rate": 4.591879242560414e-05, "loss": 0.1844, "num_input_tokens_seen": 11366304, "step": 11285 }, { "epoch": 5.322960867515323, "grad_norm": 0.1253214329481125, "learning_rate": 4.591315821904827e-05, "loss": 0.1584, "num_input_tokens_seen": 11370848, "step": 11290 }, { "epoch": 5.325318246110325, "grad_norm": 1.6318687200546265, "learning_rate": 4.5907520472249805e-05, "loss": 0.1583, "num_input_tokens_seen": 11375584, "step": 11295 }, { "epoch": 5.327675624705328, "grad_norm": 1.213168740272522, "learning_rate": 4.590187918616311e-05, "loss": 0.0927, "num_input_tokens_seen": 11380768, "step": 11300 }, { "epoch": 5.33003300330033, "grad_norm": 1.6478568315505981, "learning_rate": 4.589623436174316e-05, "loss": 0.2429, "num_input_tokens_seen": 11386720, "step": 11305 }, { "epoch": 5.332390381895332, "grad_norm": 0.4568209946155548, "learning_rate": 4.589058599994553e-05, "loss": 0.0628, "num_input_tokens_seen": 11392128, "step": 11310 }, { "epoch": 5.334747760490335, "grad_norm": 0.6499640345573425, "learning_rate": 4.58849341017264e-05, "loss": 0.0651, "num_input_tokens_seen": 11398144, "step": 11315 }, { "epoch": 5.337105139085337, "grad_norm": 0.19519159197807312, "learning_rate": 4.5879278668042525e-05, "loss": 0.1349, "num_input_tokens_seen": 11402656, "step": 11320 }, { "epoch": 5.33946251768034, "grad_norm": 0.47827038168907166, "learning_rate": 4.587361969985128e-05, "loss": 0.1087, "num_input_tokens_seen": 11407904, "step": 11325 }, { "epoch": 5.341819896275342, "grad_norm": 1.7072802782058716, "learning_rate": 4.586795719811064e-05, "loss": 0.2174, "num_input_tokens_seen": 11413696, "step": 11330 }, { "epoch": 5.344177274870344, "grad_norm": 0.10635650157928467, "learning_rate": 4.5862291163779165e-05, "loss": 0.0491, "num_input_tokens_seen": 11419200, "step": 11335 }, { "epoch": 5.346534653465347, "grad_norm": 1.0107989311218262, "learning_rate": 4.5856621597816015e-05, "loss": 0.1787, "num_input_tokens_seen": 11423744, "step": 11340 }, { "epoch": 5.348892032060349, "grad_norm": 0.32785525918006897, "learning_rate": 4.5850948501180955e-05, "loss": 0.1153, "num_input_tokens_seen": 11428384, "step": 11345 }, { "epoch": 5.351249410655352, "grad_norm": 1.5561591386795044, "learning_rate": 4.584527187483434e-05, "loss": 0.0868, "num_input_tokens_seen": 11432256, "step": 11350 }, { "epoch": 5.353606789250353, "grad_norm": 1.7000492811203003, "learning_rate": 4.583959171973713e-05, "loss": 0.1607, "num_input_tokens_seen": 11437024, "step": 11355 }, { "epoch": 5.355964167845356, "grad_norm": 0.8720440864562988, "learning_rate": 4.583390803685088e-05, "loss": 0.2216, "num_input_tokens_seen": 11443520, "step": 11360 }, { "epoch": 5.358321546440358, "grad_norm": 0.05695750564336777, "learning_rate": 4.582822082713773e-05, "loss": 0.0463, "num_input_tokens_seen": 11448000, "step": 11365 }, { "epoch": 5.36067892503536, "grad_norm": 0.7074188590049744, "learning_rate": 4.5822530091560446e-05, "loss": 0.0342, "num_input_tokens_seen": 11451648, "step": 11370 }, { "epoch": 5.363036303630363, "grad_norm": 1.1985077857971191, "learning_rate": 4.581683583108236e-05, "loss": 0.1428, "num_input_tokens_seen": 11457600, "step": 11375 }, { "epoch": 5.365393682225365, "grad_norm": 0.6459208726882935, "learning_rate": 4.5811138046667426e-05, "loss": 0.2321, "num_input_tokens_seen": 11461920, "step": 11380 }, { "epoch": 5.367751060820368, "grad_norm": 0.17550723254680634, "learning_rate": 4.5805436739280164e-05, "loss": 0.0471, "num_input_tokens_seen": 11466880, "step": 11385 }, { "epoch": 5.37010843941537, "grad_norm": 0.12540201842784882, "learning_rate": 4.579973190988572e-05, "loss": 0.2042, "num_input_tokens_seen": 11472512, "step": 11390 }, { "epoch": 5.372465818010372, "grad_norm": 0.048246048390865326, "learning_rate": 4.5794023559449824e-05, "loss": 0.0697, "num_input_tokens_seen": 11477920, "step": 11395 }, { "epoch": 5.374823196605375, "grad_norm": 0.409005731344223, "learning_rate": 4.57883116889388e-05, "loss": 0.1281, "num_input_tokens_seen": 11482656, "step": 11400 }, { "epoch": 5.377180575200377, "grad_norm": 1.7194066047668457, "learning_rate": 4.578259629931958e-05, "loss": 0.0936, "num_input_tokens_seen": 11488704, "step": 11405 }, { "epoch": 5.37953795379538, "grad_norm": 1.6022876501083374, "learning_rate": 4.577687739155967e-05, "loss": 0.2551, "num_input_tokens_seen": 11493504, "step": 11410 }, { "epoch": 5.381895332390382, "grad_norm": 1.3578393459320068, "learning_rate": 4.577115496662718e-05, "loss": 0.1474, "num_input_tokens_seen": 11499296, "step": 11415 }, { "epoch": 5.384252710985384, "grad_norm": 0.30118295550346375, "learning_rate": 4.576542902549084e-05, "loss": 0.1625, "num_input_tokens_seen": 11503360, "step": 11420 }, { "epoch": 5.386610089580387, "grad_norm": 0.2095346599817276, "learning_rate": 4.575969956911994e-05, "loss": 0.0236, "num_input_tokens_seen": 11507200, "step": 11425 }, { "epoch": 5.388967468175389, "grad_norm": 1.1229485273361206, "learning_rate": 4.575396659848438e-05, "loss": 0.1019, "num_input_tokens_seen": 11511520, "step": 11430 }, { "epoch": 5.391324846770392, "grad_norm": 0.7656957507133484, "learning_rate": 4.5748230114554666e-05, "loss": 0.0761, "num_input_tokens_seen": 11516096, "step": 11435 }, { "epoch": 5.393682225365394, "grad_norm": 1.7777378559112549, "learning_rate": 4.574249011830187e-05, "loss": 0.1303, "num_input_tokens_seen": 11521632, "step": 11440 }, { "epoch": 5.396039603960396, "grad_norm": 0.6161177754402161, "learning_rate": 4.573674661069768e-05, "loss": 0.0959, "num_input_tokens_seen": 11527744, "step": 11445 }, { "epoch": 5.398396982555399, "grad_norm": 0.8061928749084473, "learning_rate": 4.573099959271439e-05, "loss": 0.1259, "num_input_tokens_seen": 11531744, "step": 11450 }, { "epoch": 5.400754361150401, "grad_norm": 1.1848015785217285, "learning_rate": 4.572524906532486e-05, "loss": 0.1529, "num_input_tokens_seen": 11536192, "step": 11455 }, { "epoch": 5.403111739745403, "grad_norm": 0.6837301254272461, "learning_rate": 4.571949502950255e-05, "loss": 0.1478, "num_input_tokens_seen": 11541152, "step": 11460 }, { "epoch": 5.405469118340405, "grad_norm": 1.1213592290878296, "learning_rate": 4.5713737486221534e-05, "loss": 0.2982, "num_input_tokens_seen": 11546560, "step": 11465 }, { "epoch": 5.4078264969354075, "grad_norm": 0.14333784580230713, "learning_rate": 4.570797643645646e-05, "loss": 0.0528, "num_input_tokens_seen": 11552000, "step": 11470 }, { "epoch": 5.41018387553041, "grad_norm": 0.3460712730884552, "learning_rate": 4.570221188118258e-05, "loss": 0.0628, "num_input_tokens_seen": 11557088, "step": 11475 }, { "epoch": 5.412541254125412, "grad_norm": 0.7266907691955566, "learning_rate": 4.5696443821375726e-05, "loss": 0.1355, "num_input_tokens_seen": 11562336, "step": 11480 }, { "epoch": 5.414898632720415, "grad_norm": 0.01604308746755123, "learning_rate": 4.569067225801234e-05, "loss": 0.0623, "num_input_tokens_seen": 11567296, "step": 11485 }, { "epoch": 5.417256011315417, "grad_norm": 0.11372414231300354, "learning_rate": 4.568489719206945e-05, "loss": 0.0649, "num_input_tokens_seen": 11572672, "step": 11490 }, { "epoch": 5.4196133899104195, "grad_norm": 0.19988782703876495, "learning_rate": 4.5679118624524673e-05, "loss": 0.1322, "num_input_tokens_seen": 11577312, "step": 11495 }, { "epoch": 5.421970768505422, "grad_norm": 0.6815755367279053, "learning_rate": 4.567333655635623e-05, "loss": 0.0688, "num_input_tokens_seen": 11582208, "step": 11500 }, { "epoch": 5.424328147100424, "grad_norm": 0.9867069721221924, "learning_rate": 4.566755098854291e-05, "loss": 0.0776, "num_input_tokens_seen": 11587424, "step": 11505 }, { "epoch": 5.426685525695427, "grad_norm": 0.6698878407478333, "learning_rate": 4.5661761922064126e-05, "loss": 0.1736, "num_input_tokens_seen": 11592000, "step": 11510 }, { "epoch": 5.429042904290429, "grad_norm": 0.6184743046760559, "learning_rate": 4.5655969357899874e-05, "loss": 0.2737, "num_input_tokens_seen": 11597088, "step": 11515 }, { "epoch": 5.4314002828854315, "grad_norm": 1.4839509725570679, "learning_rate": 4.565017329703072e-05, "loss": 0.1564, "num_input_tokens_seen": 11601568, "step": 11520 }, { "epoch": 5.433757661480434, "grad_norm": 0.08531443774700165, "learning_rate": 4.5644373740437855e-05, "loss": 0.0851, "num_input_tokens_seen": 11606752, "step": 11525 }, { "epoch": 5.436115040075436, "grad_norm": 0.6356407999992371, "learning_rate": 4.5638570689103036e-05, "loss": 0.1217, "num_input_tokens_seen": 11612288, "step": 11530 }, { "epoch": 5.438472418670439, "grad_norm": 2.8869354724884033, "learning_rate": 4.563276414400861e-05, "loss": 0.2124, "num_input_tokens_seen": 11617728, "step": 11535 }, { "epoch": 5.440829797265441, "grad_norm": 0.20119106769561768, "learning_rate": 4.5626954106137554e-05, "loss": 0.0763, "num_input_tokens_seen": 11623040, "step": 11540 }, { "epoch": 5.4431871758604435, "grad_norm": 0.688445508480072, "learning_rate": 4.5621140576473384e-05, "loss": 0.1053, "num_input_tokens_seen": 11628416, "step": 11545 }, { "epoch": 5.445544554455446, "grad_norm": 0.811418890953064, "learning_rate": 4.5615323556000246e-05, "loss": 0.1233, "num_input_tokens_seen": 11633312, "step": 11550 }, { "epoch": 5.4479019330504475, "grad_norm": 0.10935428738594055, "learning_rate": 4.560950304570285e-05, "loss": 0.1378, "num_input_tokens_seen": 11637920, "step": 11555 }, { "epoch": 5.45025931164545, "grad_norm": 1.3864758014678955, "learning_rate": 4.5603679046566525e-05, "loss": 0.1072, "num_input_tokens_seen": 11643392, "step": 11560 }, { "epoch": 5.452616690240452, "grad_norm": 0.5329928994178772, "learning_rate": 4.559785155957717e-05, "loss": 0.1023, "num_input_tokens_seen": 11648064, "step": 11565 }, { "epoch": 5.454974068835455, "grad_norm": 2.1272501945495605, "learning_rate": 4.559202058572127e-05, "loss": 0.4234, "num_input_tokens_seen": 11654752, "step": 11570 }, { "epoch": 5.457331447430457, "grad_norm": 0.5255856513977051, "learning_rate": 4.558618612598593e-05, "loss": 0.0786, "num_input_tokens_seen": 11659680, "step": 11575 }, { "epoch": 5.4596888260254595, "grad_norm": 0.21410571038722992, "learning_rate": 4.55803481813588e-05, "loss": 0.1033, "num_input_tokens_seen": 11664064, "step": 11580 }, { "epoch": 5.462046204620462, "grad_norm": 0.4439372420310974, "learning_rate": 4.5574506752828164e-05, "loss": 0.2388, "num_input_tokens_seen": 11669760, "step": 11585 }, { "epoch": 5.464403583215464, "grad_norm": 0.12559327483177185, "learning_rate": 4.556866184138287e-05, "loss": 0.1797, "num_input_tokens_seen": 11673952, "step": 11590 }, { "epoch": 5.466760961810467, "grad_norm": 0.15000544488430023, "learning_rate": 4.5562813448012354e-05, "loss": 0.1113, "num_input_tokens_seen": 11679040, "step": 11595 }, { "epoch": 5.469118340405469, "grad_norm": 0.6415334343910217, "learning_rate": 4.555696157370667e-05, "loss": 0.0559, "num_input_tokens_seen": 11683872, "step": 11600 }, { "epoch": 5.4714757190004715, "grad_norm": 2.303415536880493, "learning_rate": 4.555110621945642e-05, "loss": 0.2195, "num_input_tokens_seen": 11689152, "step": 11605 }, { "epoch": 5.473833097595474, "grad_norm": 1.0810377597808838, "learning_rate": 4.554524738625283e-05, "loss": 0.1179, "num_input_tokens_seen": 11694208, "step": 11610 }, { "epoch": 5.476190476190476, "grad_norm": 1.1352567672729492, "learning_rate": 4.5539385075087694e-05, "loss": 0.2087, "num_input_tokens_seen": 11701280, "step": 11615 }, { "epoch": 5.478547854785479, "grad_norm": 1.3011806011199951, "learning_rate": 4.55335192869534e-05, "loss": 0.1841, "num_input_tokens_seen": 11706208, "step": 11620 }, { "epoch": 5.480905233380481, "grad_norm": 0.04061314836144447, "learning_rate": 4.552765002284294e-05, "loss": 0.084, "num_input_tokens_seen": 11710400, "step": 11625 }, { "epoch": 5.4832626119754835, "grad_norm": 0.1397678554058075, "learning_rate": 4.552177728374987e-05, "loss": 0.25, "num_input_tokens_seen": 11716640, "step": 11630 }, { "epoch": 5.485619990570486, "grad_norm": 0.3713015019893646, "learning_rate": 4.551590107066833e-05, "loss": 0.1217, "num_input_tokens_seen": 11721824, "step": 11635 }, { "epoch": 5.487977369165488, "grad_norm": 0.5338589549064636, "learning_rate": 4.5510021384593094e-05, "loss": 0.1324, "num_input_tokens_seen": 11727904, "step": 11640 }, { "epoch": 5.490334747760491, "grad_norm": 0.7151191234588623, "learning_rate": 4.5504138226519474e-05, "loss": 0.3763, "num_input_tokens_seen": 11733184, "step": 11645 }, { "epoch": 5.492692126355493, "grad_norm": 3.7462615966796875, "learning_rate": 4.5498251597443384e-05, "loss": 0.1878, "num_input_tokens_seen": 11737792, "step": 11650 }, { "epoch": 5.4950495049504955, "grad_norm": 0.24551251530647278, "learning_rate": 4.5492361498361353e-05, "loss": 0.0728, "num_input_tokens_seen": 11741728, "step": 11655 }, { "epoch": 5.497406883545497, "grad_norm": 0.16685831546783447, "learning_rate": 4.548646793027046e-05, "loss": 0.0645, "num_input_tokens_seen": 11746336, "step": 11660 }, { "epoch": 5.499764262140499, "grad_norm": 0.9716968536376953, "learning_rate": 4.548057089416837e-05, "loss": 0.1758, "num_input_tokens_seen": 11752480, "step": 11665 }, { "epoch": 5.502121640735502, "grad_norm": 0.13734936714172363, "learning_rate": 4.5474670391053385e-05, "loss": 0.179, "num_input_tokens_seen": 11757120, "step": 11670 }, { "epoch": 5.504479019330504, "grad_norm": 0.3905954658985138, "learning_rate": 4.546876642192434e-05, "loss": 0.3097, "num_input_tokens_seen": 11760896, "step": 11675 }, { "epoch": 5.506836397925507, "grad_norm": 0.5163979530334473, "learning_rate": 4.546285898778068e-05, "loss": 0.0914, "num_input_tokens_seen": 11765888, "step": 11680 }, { "epoch": 5.509193776520509, "grad_norm": 0.2927565574645996, "learning_rate": 4.545694808962243e-05, "loss": 0.0951, "num_input_tokens_seen": 11770432, "step": 11685 }, { "epoch": 5.511551155115511, "grad_norm": 2.4610893726348877, "learning_rate": 4.54510337284502e-05, "loss": 0.26, "num_input_tokens_seen": 11776736, "step": 11690 }, { "epoch": 5.513908533710514, "grad_norm": 0.37156906723976135, "learning_rate": 4.544511590526521e-05, "loss": 0.0824, "num_input_tokens_seen": 11781792, "step": 11695 }, { "epoch": 5.516265912305516, "grad_norm": 0.20920546352863312, "learning_rate": 4.543919462106922e-05, "loss": 0.1828, "num_input_tokens_seen": 11786912, "step": 11700 }, { "epoch": 5.518623290900519, "grad_norm": 0.0493609718978405, "learning_rate": 4.5433269876864634e-05, "loss": 0.1622, "num_input_tokens_seen": 11792960, "step": 11705 }, { "epoch": 5.520980669495521, "grad_norm": 0.42379727959632874, "learning_rate": 4.54273416736544e-05, "loss": 0.1506, "num_input_tokens_seen": 11797408, "step": 11710 }, { "epoch": 5.523338048090523, "grad_norm": 1.6782636642456055, "learning_rate": 4.5421410012442046e-05, "loss": 0.1211, "num_input_tokens_seen": 11802176, "step": 11715 }, { "epoch": 5.525695426685526, "grad_norm": 1.4705439805984497, "learning_rate": 4.541547489423171e-05, "loss": 0.4307, "num_input_tokens_seen": 11806976, "step": 11720 }, { "epoch": 5.528052805280528, "grad_norm": 1.0286091566085815, "learning_rate": 4.540953632002812e-05, "loss": 0.0883, "num_input_tokens_seen": 11811200, "step": 11725 }, { "epoch": 5.530410183875531, "grad_norm": 2.1975209712982178, "learning_rate": 4.540359429083656e-05, "loss": 0.1915, "num_input_tokens_seen": 11816480, "step": 11730 }, { "epoch": 5.532767562470533, "grad_norm": 0.3150268793106079, "learning_rate": 4.5397648807662915e-05, "loss": 0.0616, "num_input_tokens_seen": 11821696, "step": 11735 }, { "epoch": 5.535124941065535, "grad_norm": 0.18966273963451385, "learning_rate": 4.539169987151367e-05, "loss": 0.2454, "num_input_tokens_seen": 11826464, "step": 11740 }, { "epoch": 5.537482319660538, "grad_norm": 0.4848150908946991, "learning_rate": 4.538574748339586e-05, "loss": 0.1181, "num_input_tokens_seen": 11832512, "step": 11745 }, { "epoch": 5.539839698255539, "grad_norm": 0.16870395839214325, "learning_rate": 4.537979164431713e-05, "loss": 0.0934, "num_input_tokens_seen": 11837056, "step": 11750 }, { "epoch": 5.542197076850542, "grad_norm": 0.9373466372489929, "learning_rate": 4.537383235528571e-05, "loss": 0.0944, "num_input_tokens_seen": 11842240, "step": 11755 }, { "epoch": 5.544554455445544, "grad_norm": 0.30836379528045654, "learning_rate": 4.53678696173104e-05, "loss": 0.186, "num_input_tokens_seen": 11847744, "step": 11760 }, { "epoch": 5.5469118340405466, "grad_norm": 0.2392689287662506, "learning_rate": 4.536190343140059e-05, "loss": 0.0987, "num_input_tokens_seen": 11852384, "step": 11765 }, { "epoch": 5.549269212635549, "grad_norm": 1.4856200218200684, "learning_rate": 4.5355933798566245e-05, "loss": 0.2112, "num_input_tokens_seen": 11857504, "step": 11770 }, { "epoch": 5.551626591230551, "grad_norm": 0.934321939945221, "learning_rate": 4.534996071981794e-05, "loss": 0.0814, "num_input_tokens_seen": 11862304, "step": 11775 }, { "epoch": 5.553983969825554, "grad_norm": 0.07666109502315521, "learning_rate": 4.5343984196166814e-05, "loss": 0.0841, "num_input_tokens_seen": 11866880, "step": 11780 }, { "epoch": 5.556341348420556, "grad_norm": 0.17654383182525635, "learning_rate": 4.533800422862458e-05, "loss": 0.2915, "num_input_tokens_seen": 11872064, "step": 11785 }, { "epoch": 5.558698727015559, "grad_norm": 0.14926141500473022, "learning_rate": 4.5332020818203544e-05, "loss": 0.0792, "num_input_tokens_seen": 11876864, "step": 11790 }, { "epoch": 5.561056105610561, "grad_norm": 0.1793426275253296, "learning_rate": 4.5326033965916606e-05, "loss": 0.0543, "num_input_tokens_seen": 11883136, "step": 11795 }, { "epoch": 5.563413484205563, "grad_norm": 0.6626551747322083, "learning_rate": 4.532004367277723e-05, "loss": 0.1749, "num_input_tokens_seen": 11887424, "step": 11800 }, { "epoch": 5.565770862800566, "grad_norm": 0.9330992102622986, "learning_rate": 4.5314049939799475e-05, "loss": 0.2219, "num_input_tokens_seen": 11891968, "step": 11805 }, { "epoch": 5.568128241395568, "grad_norm": 2.069986343383789, "learning_rate": 4.530805276799798e-05, "loss": 0.436, "num_input_tokens_seen": 11897408, "step": 11810 }, { "epoch": 5.570485619990571, "grad_norm": 0.1444053053855896, "learning_rate": 4.530205215838796e-05, "loss": 0.1224, "num_input_tokens_seen": 11901696, "step": 11815 }, { "epoch": 5.572842998585573, "grad_norm": 0.5590082406997681, "learning_rate": 4.529604811198523e-05, "loss": 0.1178, "num_input_tokens_seen": 11907104, "step": 11820 }, { "epoch": 5.575200377180575, "grad_norm": 0.9607920050621033, "learning_rate": 4.5290040629806155e-05, "loss": 0.1454, "num_input_tokens_seen": 11912704, "step": 11825 }, { "epoch": 5.577557755775578, "grad_norm": 0.49402838945388794, "learning_rate": 4.528402971286771e-05, "loss": 0.1174, "num_input_tokens_seen": 11918656, "step": 11830 }, { "epoch": 5.57991513437058, "grad_norm": 0.1613006442785263, "learning_rate": 4.527801536218743e-05, "loss": 0.1173, "num_input_tokens_seen": 11923360, "step": 11835 }, { "epoch": 5.582272512965583, "grad_norm": 0.5179317593574524, "learning_rate": 4.5271997578783455e-05, "loss": 0.1191, "num_input_tokens_seen": 11928064, "step": 11840 }, { "epoch": 5.584629891560585, "grad_norm": 0.38914620876312256, "learning_rate": 4.526597636367449e-05, "loss": 0.131, "num_input_tokens_seen": 11934912, "step": 11845 }, { "epoch": 5.586987270155587, "grad_norm": 1.4485527276992798, "learning_rate": 4.525995171787982e-05, "loss": 0.0952, "num_input_tokens_seen": 11939456, "step": 11850 }, { "epoch": 5.58934464875059, "grad_norm": 1.3009605407714844, "learning_rate": 4.525392364241932e-05, "loss": 0.2726, "num_input_tokens_seen": 11943904, "step": 11855 }, { "epoch": 5.591702027345592, "grad_norm": 0.09189406037330627, "learning_rate": 4.5247892138313435e-05, "loss": 0.0483, "num_input_tokens_seen": 11951008, "step": 11860 }, { "epoch": 5.594059405940594, "grad_norm": 0.19712629914283752, "learning_rate": 4.5241857206583205e-05, "loss": 0.077, "num_input_tokens_seen": 11956352, "step": 11865 }, { "epoch": 5.596416784535596, "grad_norm": 0.04366704076528549, "learning_rate": 4.5235818848250236e-05, "loss": 0.2397, "num_input_tokens_seen": 11961024, "step": 11870 }, { "epoch": 5.5987741631305985, "grad_norm": 0.08945287019014359, "learning_rate": 4.522977706433672e-05, "loss": 0.0616, "num_input_tokens_seen": 11966496, "step": 11875 }, { "epoch": 5.601131541725601, "grad_norm": 2.038158893585205, "learning_rate": 4.522373185586544e-05, "loss": 0.2272, "num_input_tokens_seen": 11972608, "step": 11880 }, { "epoch": 5.603488920320603, "grad_norm": 0.8652018904685974, "learning_rate": 4.521768322385972e-05, "loss": 0.0551, "num_input_tokens_seen": 11977664, "step": 11885 }, { "epoch": 5.605846298915606, "grad_norm": 1.1290127038955688, "learning_rate": 4.521163116934351e-05, "loss": 0.3241, "num_input_tokens_seen": 11982400, "step": 11890 }, { "epoch": 5.608203677510608, "grad_norm": 0.04070322960615158, "learning_rate": 4.520557569334133e-05, "loss": 0.1546, "num_input_tokens_seen": 11987968, "step": 11895 }, { "epoch": 5.6105610561056105, "grad_norm": 1.4846510887145996, "learning_rate": 4.5199516796878246e-05, "loss": 0.2735, "num_input_tokens_seen": 11993120, "step": 11900 }, { "epoch": 5.612918434700613, "grad_norm": 2.2052009105682373, "learning_rate": 4.5193454480979935e-05, "loss": 0.2493, "num_input_tokens_seen": 11998400, "step": 11905 }, { "epoch": 5.615275813295615, "grad_norm": 0.2510222792625427, "learning_rate": 4.518738874667265e-05, "loss": 0.0739, "num_input_tokens_seen": 12002592, "step": 11910 }, { "epoch": 5.617633191890618, "grad_norm": 0.45009395480155945, "learning_rate": 4.518131959498321e-05, "loss": 0.2312, "num_input_tokens_seen": 12007744, "step": 11915 }, { "epoch": 5.61999057048562, "grad_norm": 1.5385756492614746, "learning_rate": 4.517524702693902e-05, "loss": 0.301, "num_input_tokens_seen": 12013024, "step": 11920 }, { "epoch": 5.6223479490806225, "grad_norm": 0.4875064790248871, "learning_rate": 4.516917104356807e-05, "loss": 0.1224, "num_input_tokens_seen": 12018016, "step": 11925 }, { "epoch": 5.624705327675625, "grad_norm": 0.3805006444454193, "learning_rate": 4.516309164589891e-05, "loss": 0.0551, "num_input_tokens_seen": 12023008, "step": 11930 }, { "epoch": 5.627062706270627, "grad_norm": 0.15370453894138336, "learning_rate": 4.515700883496069e-05, "loss": 0.1088, "num_input_tokens_seen": 12027808, "step": 11935 }, { "epoch": 5.62942008486563, "grad_norm": 0.6280245780944824, "learning_rate": 4.515092261178312e-05, "loss": 0.1101, "num_input_tokens_seen": 12032736, "step": 11940 }, { "epoch": 5.631777463460632, "grad_norm": 1.2162996530532837, "learning_rate": 4.514483297739649e-05, "loss": 0.2142, "num_input_tokens_seen": 12037824, "step": 11945 }, { "epoch": 5.634134842055634, "grad_norm": 0.2628955841064453, "learning_rate": 4.5138739932831674e-05, "loss": 0.271, "num_input_tokens_seen": 12042368, "step": 11950 }, { "epoch": 5.636492220650636, "grad_norm": 0.4715646505355835, "learning_rate": 4.513264347912013e-05, "loss": 0.1187, "num_input_tokens_seen": 12047584, "step": 11955 }, { "epoch": 5.6388495992456384, "grad_norm": 1.3183525800704956, "learning_rate": 4.512654361729388e-05, "loss": 0.0988, "num_input_tokens_seen": 12051968, "step": 11960 }, { "epoch": 5.641206977840641, "grad_norm": 0.38612765073776245, "learning_rate": 4.5120440348385524e-05, "loss": 0.0926, "num_input_tokens_seen": 12057216, "step": 11965 }, { "epoch": 5.643564356435643, "grad_norm": 0.327724814414978, "learning_rate": 4.511433367342824e-05, "loss": 0.0876, "num_input_tokens_seen": 12062368, "step": 11970 }, { "epoch": 5.645921735030646, "grad_norm": 0.8658179640769958, "learning_rate": 4.510822359345579e-05, "loss": 0.1888, "num_input_tokens_seen": 12066368, "step": 11975 }, { "epoch": 5.648279113625648, "grad_norm": 0.15798459947109222, "learning_rate": 4.510211010950249e-05, "loss": 0.055, "num_input_tokens_seen": 12070816, "step": 11980 }, { "epoch": 5.6506364922206505, "grad_norm": 1.9733517169952393, "learning_rate": 4.5095993222603285e-05, "loss": 0.1566, "num_input_tokens_seen": 12075744, "step": 11985 }, { "epoch": 5.652993870815653, "grad_norm": 0.49479714035987854, "learning_rate": 4.508987293379362e-05, "loss": 0.2926, "num_input_tokens_seen": 12081184, "step": 11990 }, { "epoch": 5.655351249410655, "grad_norm": 0.33630990982055664, "learning_rate": 4.508374924410959e-05, "loss": 0.066, "num_input_tokens_seen": 12086144, "step": 11995 }, { "epoch": 5.657708628005658, "grad_norm": 0.8318001627922058, "learning_rate": 4.50776221545878e-05, "loss": 0.1806, "num_input_tokens_seen": 12090528, "step": 12000 }, { "epoch": 5.66006600660066, "grad_norm": 0.37764841318130493, "learning_rate": 4.5071491666265483e-05, "loss": 0.0687, "num_input_tokens_seen": 12094688, "step": 12005 }, { "epoch": 5.6624233851956625, "grad_norm": 0.21449178457260132, "learning_rate": 4.5065357780180424e-05, "loss": 0.0612, "num_input_tokens_seen": 12100352, "step": 12010 }, { "epoch": 5.664780763790665, "grad_norm": 0.16045618057250977, "learning_rate": 4.505922049737098e-05, "loss": 0.0912, "num_input_tokens_seen": 12104768, "step": 12015 }, { "epoch": 5.667138142385667, "grad_norm": 0.0591387078166008, "learning_rate": 4.50530798188761e-05, "loss": 0.1223, "num_input_tokens_seen": 12109568, "step": 12020 }, { "epoch": 5.66949552098067, "grad_norm": 1.6686551570892334, "learning_rate": 4.504693574573528e-05, "loss": 0.1768, "num_input_tokens_seen": 12114080, "step": 12025 }, { "epoch": 5.671852899575672, "grad_norm": 0.2404339611530304, "learning_rate": 4.504078827898862e-05, "loss": 0.0639, "num_input_tokens_seen": 12118624, "step": 12030 }, { "epoch": 5.6742102781706745, "grad_norm": 0.2649906277656555, "learning_rate": 4.503463741967677e-05, "loss": 0.1652, "num_input_tokens_seen": 12123232, "step": 12035 }, { "epoch": 5.676567656765677, "grad_norm": 1.499467134475708, "learning_rate": 4.5028483168840986e-05, "loss": 0.2648, "num_input_tokens_seen": 12128480, "step": 12040 }, { "epoch": 5.678925035360679, "grad_norm": 1.792858362197876, "learning_rate": 4.502232552752306e-05, "loss": 0.0751, "num_input_tokens_seen": 12133600, "step": 12045 }, { "epoch": 5.681282413955682, "grad_norm": 1.64610755443573, "learning_rate": 4.501616449676538e-05, "loss": 0.2601, "num_input_tokens_seen": 12137920, "step": 12050 }, { "epoch": 5.683639792550684, "grad_norm": 0.1885506510734558, "learning_rate": 4.50100000776109e-05, "loss": 0.0323, "num_input_tokens_seen": 12143008, "step": 12055 }, { "epoch": 5.6859971711456865, "grad_norm": 0.06847196817398071, "learning_rate": 4.5003832271103166e-05, "loss": 0.1492, "num_input_tokens_seen": 12147328, "step": 12060 }, { "epoch": 5.688354549740688, "grad_norm": 0.36772045493125916, "learning_rate": 4.499766107828628e-05, "loss": 0.0764, "num_input_tokens_seen": 12152960, "step": 12065 }, { "epoch": 5.69071192833569, "grad_norm": 0.030689314007759094, "learning_rate": 4.4991486500204905e-05, "loss": 0.1086, "num_input_tokens_seen": 12157952, "step": 12070 }, { "epoch": 5.693069306930693, "grad_norm": 0.41158032417297363, "learning_rate": 4.49853085379043e-05, "loss": 0.2238, "num_input_tokens_seen": 12163136, "step": 12075 }, { "epoch": 5.695426685525695, "grad_norm": 1.0257095098495483, "learning_rate": 4.49791271924303e-05, "loss": 0.1214, "num_input_tokens_seen": 12168512, "step": 12080 }, { "epoch": 5.697784064120698, "grad_norm": 1.3431813716888428, "learning_rate": 4.4972942464829285e-05, "loss": 0.1296, "num_input_tokens_seen": 12173984, "step": 12085 }, { "epoch": 5.7001414427157, "grad_norm": 0.6704437136650085, "learning_rate": 4.4966754356148235e-05, "loss": 0.1266, "num_input_tokens_seen": 12178656, "step": 12090 }, { "epoch": 5.702498821310702, "grad_norm": 0.0645926371216774, "learning_rate": 4.4960562867434685e-05, "loss": 0.2383, "num_input_tokens_seen": 12184224, "step": 12095 }, { "epoch": 5.704856199905705, "grad_norm": 0.05597847327589989, "learning_rate": 4.495436799973676e-05, "loss": 0.1149, "num_input_tokens_seen": 12189600, "step": 12100 }, { "epoch": 5.707213578500707, "grad_norm": 1.3454419374465942, "learning_rate": 4.494816975410313e-05, "loss": 0.1094, "num_input_tokens_seen": 12194208, "step": 12105 }, { "epoch": 5.70957095709571, "grad_norm": 0.04780229926109314, "learning_rate": 4.494196813158307e-05, "loss": 0.0919, "num_input_tokens_seen": 12198368, "step": 12110 }, { "epoch": 5.711928335690712, "grad_norm": 2.3741743564605713, "learning_rate": 4.4935763133226393e-05, "loss": 0.3229, "num_input_tokens_seen": 12202816, "step": 12115 }, { "epoch": 5.714285714285714, "grad_norm": 0.16822393238544464, "learning_rate": 4.4929554760083516e-05, "loss": 0.1405, "num_input_tokens_seen": 12207456, "step": 12120 }, { "epoch": 5.716643092880717, "grad_norm": 0.35187116265296936, "learning_rate": 4.492334301320539e-05, "loss": 0.1398, "num_input_tokens_seen": 12212224, "step": 12125 }, { "epoch": 5.719000471475719, "grad_norm": 0.8698214292526245, "learning_rate": 4.491712789364358e-05, "loss": 0.1366, "num_input_tokens_seen": 12217056, "step": 12130 }, { "epoch": 5.721357850070722, "grad_norm": 3.5907087326049805, "learning_rate": 4.491090940245018e-05, "loss": 0.1265, "num_input_tokens_seen": 12221696, "step": 12135 }, { "epoch": 5.723715228665724, "grad_norm": 0.6190409064292908, "learning_rate": 4.49046875406779e-05, "loss": 0.1636, "num_input_tokens_seen": 12227040, "step": 12140 }, { "epoch": 5.726072607260726, "grad_norm": 1.7212579250335693, "learning_rate": 4.489846230937998e-05, "loss": 0.1397, "num_input_tokens_seen": 12231360, "step": 12145 }, { "epoch": 5.728429985855728, "grad_norm": 0.33226075768470764, "learning_rate": 4.489223370961025e-05, "loss": 0.1745, "num_input_tokens_seen": 12236768, "step": 12150 }, { "epoch": 5.73078736445073, "grad_norm": 0.9342212080955505, "learning_rate": 4.48860017424231e-05, "loss": 0.0955, "num_input_tokens_seen": 12241120, "step": 12155 }, { "epoch": 5.733144743045733, "grad_norm": 0.2156396359205246, "learning_rate": 4.4879766408873494e-05, "loss": 0.1094, "num_input_tokens_seen": 12245664, "step": 12160 }, { "epoch": 5.735502121640735, "grad_norm": 0.09243413060903549, "learning_rate": 4.487352771001698e-05, "loss": 0.2817, "num_input_tokens_seen": 12250336, "step": 12165 }, { "epoch": 5.7378595002357375, "grad_norm": 0.9869149923324585, "learning_rate": 4.486728564690966e-05, "loss": 0.153, "num_input_tokens_seen": 12255168, "step": 12170 }, { "epoch": 5.74021687883074, "grad_norm": 0.3022141456604004, "learning_rate": 4.48610402206082e-05, "loss": 0.2676, "num_input_tokens_seen": 12260544, "step": 12175 }, { "epoch": 5.742574257425742, "grad_norm": 0.23712709546089172, "learning_rate": 4.4854791432169855e-05, "loss": 0.1299, "num_input_tokens_seen": 12264512, "step": 12180 }, { "epoch": 5.744931636020745, "grad_norm": 0.0657973662018776, "learning_rate": 4.4848539282652444e-05, "loss": 0.0815, "num_input_tokens_seen": 12269856, "step": 12185 }, { "epoch": 5.747289014615747, "grad_norm": 0.2601316571235657, "learning_rate": 4.484228377311433e-05, "loss": 0.0942, "num_input_tokens_seen": 12275744, "step": 12190 }, { "epoch": 5.7496463932107496, "grad_norm": 0.8761990070343018, "learning_rate": 4.483602490461448e-05, "loss": 0.0595, "num_input_tokens_seen": 12280672, "step": 12195 }, { "epoch": 5.752003771805752, "grad_norm": 1.5353121757507324, "learning_rate": 4.482976267821241e-05, "loss": 0.3436, "num_input_tokens_seen": 12286464, "step": 12200 }, { "epoch": 5.754361150400754, "grad_norm": 0.5764215588569641, "learning_rate": 4.482349709496821e-05, "loss": 0.3628, "num_input_tokens_seen": 12291712, "step": 12205 }, { "epoch": 5.756718528995757, "grad_norm": 0.37274280190467834, "learning_rate": 4.481722815594254e-05, "loss": 0.0552, "num_input_tokens_seen": 12296288, "step": 12210 }, { "epoch": 5.759075907590759, "grad_norm": 0.4571888744831085, "learning_rate": 4.481095586219661e-05, "loss": 0.1413, "num_input_tokens_seen": 12301856, "step": 12215 }, { "epoch": 5.761433286185762, "grad_norm": 0.5960550308227539, "learning_rate": 4.480468021479223e-05, "loss": 0.3117, "num_input_tokens_seen": 12306528, "step": 12220 }, { "epoch": 5.763790664780764, "grad_norm": 1.2119630575180054, "learning_rate": 4.479840121479175e-05, "loss": 0.247, "num_input_tokens_seen": 12311584, "step": 12225 }, { "epoch": 5.766148043375766, "grad_norm": 0.13246919214725494, "learning_rate": 4.4792118863258104e-05, "loss": 0.1281, "num_input_tokens_seen": 12316000, "step": 12230 }, { "epoch": 5.768505421970769, "grad_norm": 1.0873653888702393, "learning_rate": 4.478583316125479e-05, "loss": 0.258, "num_input_tokens_seen": 12321248, "step": 12235 }, { "epoch": 5.770862800565771, "grad_norm": 0.13563720881938934, "learning_rate": 4.477954410984586e-05, "loss": 0.0435, "num_input_tokens_seen": 12326272, "step": 12240 }, { "epoch": 5.773220179160774, "grad_norm": 0.5925991535186768, "learning_rate": 4.4773251710095956e-05, "loss": 0.0428, "num_input_tokens_seen": 12333472, "step": 12245 }, { "epoch": 5.775577557755776, "grad_norm": 2.2885115146636963, "learning_rate": 4.4766955963070265e-05, "loss": 0.2882, "num_input_tokens_seen": 12338848, "step": 12250 }, { "epoch": 5.777934936350778, "grad_norm": 2.1220855712890625, "learning_rate": 4.476065686983455e-05, "loss": 0.2198, "num_input_tokens_seen": 12344992, "step": 12255 }, { "epoch": 5.780292314945781, "grad_norm": 0.9894759654998779, "learning_rate": 4.4754354431455145e-05, "loss": 0.1109, "num_input_tokens_seen": 12350976, "step": 12260 }, { "epoch": 5.782649693540782, "grad_norm": 0.9661554098129272, "learning_rate": 4.474804864899895e-05, "loss": 0.1791, "num_input_tokens_seen": 12355776, "step": 12265 }, { "epoch": 5.785007072135785, "grad_norm": 0.0643625259399414, "learning_rate": 4.474173952353342e-05, "loss": 0.0463, "num_input_tokens_seen": 12360864, "step": 12270 }, { "epoch": 5.787364450730787, "grad_norm": 0.8409623503684998, "learning_rate": 4.473542705612658e-05, "loss": 0.1661, "num_input_tokens_seen": 12365472, "step": 12275 }, { "epoch": 5.7897218293257895, "grad_norm": 0.7376725673675537, "learning_rate": 4.4729111247847026e-05, "loss": 0.0439, "num_input_tokens_seen": 12369280, "step": 12280 }, { "epoch": 5.792079207920792, "grad_norm": 1.0935238599777222, "learning_rate": 4.472279209976393e-05, "loss": 0.1447, "num_input_tokens_seen": 12373600, "step": 12285 }, { "epoch": 5.794436586515794, "grad_norm": 0.6133126020431519, "learning_rate": 4.4716469612946996e-05, "loss": 0.074, "num_input_tokens_seen": 12377440, "step": 12290 }, { "epoch": 5.796793965110797, "grad_norm": 0.2582428753376007, "learning_rate": 4.471014378846652e-05, "loss": 0.0253, "num_input_tokens_seen": 12382688, "step": 12295 }, { "epoch": 5.799151343705799, "grad_norm": 0.36669987440109253, "learning_rate": 4.470381462739337e-05, "loss": 0.1954, "num_input_tokens_seen": 12387104, "step": 12300 }, { "epoch": 5.8015087223008015, "grad_norm": 2.113736391067505, "learning_rate": 4.469748213079895e-05, "loss": 0.212, "num_input_tokens_seen": 12391968, "step": 12305 }, { "epoch": 5.803866100895804, "grad_norm": 0.4600358307361603, "learning_rate": 4.469114629975525e-05, "loss": 0.1005, "num_input_tokens_seen": 12398496, "step": 12310 }, { "epoch": 5.806223479490806, "grad_norm": 1.586434006690979, "learning_rate": 4.468480713533481e-05, "loss": 0.251, "num_input_tokens_seen": 12404128, "step": 12315 }, { "epoch": 5.808580858085809, "grad_norm": 0.4436407685279846, "learning_rate": 4.467846463861077e-05, "loss": 0.0742, "num_input_tokens_seen": 12408960, "step": 12320 }, { "epoch": 5.810938236680811, "grad_norm": 1.0438992977142334, "learning_rate": 4.467211881065677e-05, "loss": 0.085, "num_input_tokens_seen": 12413440, "step": 12325 }, { "epoch": 5.8132956152758135, "grad_norm": 2.164027452468872, "learning_rate": 4.466576965254708e-05, "loss": 0.1935, "num_input_tokens_seen": 12417376, "step": 12330 }, { "epoch": 5.815652993870816, "grad_norm": 0.3846036195755005, "learning_rate": 4.4659417165356486e-05, "loss": 0.1096, "num_input_tokens_seen": 12422784, "step": 12335 }, { "epoch": 5.818010372465818, "grad_norm": 0.7735828161239624, "learning_rate": 4.465306135016037e-05, "loss": 0.0786, "num_input_tokens_seen": 12427936, "step": 12340 }, { "epoch": 5.820367751060821, "grad_norm": 0.3422633111476898, "learning_rate": 4.464670220803466e-05, "loss": 0.0804, "num_input_tokens_seen": 12432928, "step": 12345 }, { "epoch": 5.822725129655822, "grad_norm": 0.04550521820783615, "learning_rate": 4.464033974005584e-05, "loss": 0.0729, "num_input_tokens_seen": 12438176, "step": 12350 }, { "epoch": 5.825082508250825, "grad_norm": 0.2984668016433716, "learning_rate": 4.463397394730098e-05, "loss": 0.1781, "num_input_tokens_seen": 12443616, "step": 12355 }, { "epoch": 5.827439886845827, "grad_norm": 0.7754859328269958, "learning_rate": 4.462760483084769e-05, "loss": 0.3124, "num_input_tokens_seen": 12448416, "step": 12360 }, { "epoch": 5.829797265440829, "grad_norm": 0.8621153831481934, "learning_rate": 4.462123239177417e-05, "loss": 0.0538, "num_input_tokens_seen": 12453856, "step": 12365 }, { "epoch": 5.832154644035832, "grad_norm": 0.32580602169036865, "learning_rate": 4.461485663115915e-05, "loss": 0.1577, "num_input_tokens_seen": 12458016, "step": 12370 }, { "epoch": 5.834512022630834, "grad_norm": 0.3066844046115875, "learning_rate": 4.460847755008195e-05, "loss": 0.147, "num_input_tokens_seen": 12463744, "step": 12375 }, { "epoch": 5.836869401225837, "grad_norm": 0.21199855208396912, "learning_rate": 4.4602095149622425e-05, "loss": 0.2012, "num_input_tokens_seen": 12468672, "step": 12380 }, { "epoch": 5.839226779820839, "grad_norm": 1.6592371463775635, "learning_rate": 4.4595709430861024e-05, "loss": 0.2239, "num_input_tokens_seen": 12472736, "step": 12385 }, { "epoch": 5.841584158415841, "grad_norm": 0.6135969161987305, "learning_rate": 4.458932039487872e-05, "loss": 0.1129, "num_input_tokens_seen": 12476768, "step": 12390 }, { "epoch": 5.843941537010844, "grad_norm": 0.08782748132944107, "learning_rate": 4.458292804275709e-05, "loss": 0.1006, "num_input_tokens_seen": 12482048, "step": 12395 }, { "epoch": 5.846298915605846, "grad_norm": 0.9898231625556946, "learning_rate": 4.457653237557824e-05, "loss": 0.2291, "num_input_tokens_seen": 12488096, "step": 12400 }, { "epoch": 5.848656294200849, "grad_norm": 0.2852109670639038, "learning_rate": 4.457013339442485e-05, "loss": 0.1775, "num_input_tokens_seen": 12493504, "step": 12405 }, { "epoch": 5.851013672795851, "grad_norm": 0.3335608243942261, "learning_rate": 4.456373110038016e-05, "loss": 0.1385, "num_input_tokens_seen": 12498208, "step": 12410 }, { "epoch": 5.8533710513908535, "grad_norm": 0.2849929928779602, "learning_rate": 4.455732549452797e-05, "loss": 0.1246, "num_input_tokens_seen": 12502656, "step": 12415 }, { "epoch": 5.855728429985856, "grad_norm": 0.3603995740413666, "learning_rate": 4.455091657795263e-05, "loss": 0.0645, "num_input_tokens_seen": 12506688, "step": 12420 }, { "epoch": 5.858085808580858, "grad_norm": 0.3778473734855652, "learning_rate": 4.4544504351739076e-05, "loss": 0.1547, "num_input_tokens_seen": 12511744, "step": 12425 }, { "epoch": 5.860443187175861, "grad_norm": 0.3390007019042969, "learning_rate": 4.453808881697278e-05, "loss": 0.1955, "num_input_tokens_seen": 12516032, "step": 12430 }, { "epoch": 5.862800565770863, "grad_norm": 1.2679741382598877, "learning_rate": 4.453166997473978e-05, "loss": 0.1858, "num_input_tokens_seen": 12520736, "step": 12435 }, { "epoch": 5.8651579443658655, "grad_norm": 2.619685173034668, "learning_rate": 4.452524782612669e-05, "loss": 0.171, "num_input_tokens_seen": 12525920, "step": 12440 }, { "epoch": 5.867515322960868, "grad_norm": 0.24812857806682587, "learning_rate": 4.451882237222066e-05, "loss": 0.0859, "num_input_tokens_seen": 12530432, "step": 12445 }, { "epoch": 5.86987270155587, "grad_norm": 0.37929412722587585, "learning_rate": 4.451239361410941e-05, "loss": 0.0888, "num_input_tokens_seen": 12535904, "step": 12450 }, { "epoch": 5.872230080150873, "grad_norm": 0.11977510154247284, "learning_rate": 4.4505961552881224e-05, "loss": 0.0981, "num_input_tokens_seen": 12540800, "step": 12455 }, { "epoch": 5.874587458745875, "grad_norm": 1.3410218954086304, "learning_rate": 4.449952618962494e-05, "loss": 0.2575, "num_input_tokens_seen": 12546304, "step": 12460 }, { "epoch": 5.876944837340877, "grad_norm": 0.7694074511528015, "learning_rate": 4.4493087525429954e-05, "loss": 0.147, "num_input_tokens_seen": 12551680, "step": 12465 }, { "epoch": 5.879302215935879, "grad_norm": 0.26803654432296753, "learning_rate": 4.448664556138623e-05, "loss": 0.0786, "num_input_tokens_seen": 12555936, "step": 12470 }, { "epoch": 5.881659594530881, "grad_norm": 0.5528765320777893, "learning_rate": 4.448020029858427e-05, "loss": 0.0707, "num_input_tokens_seen": 12560512, "step": 12475 }, { "epoch": 5.884016973125884, "grad_norm": 0.5806404948234558, "learning_rate": 4.447375173811516e-05, "loss": 0.1565, "num_input_tokens_seen": 12565056, "step": 12480 }, { "epoch": 5.886374351720886, "grad_norm": 0.4363386631011963, "learning_rate": 4.446729988107052e-05, "loss": 0.0963, "num_input_tokens_seen": 12569344, "step": 12485 }, { "epoch": 5.888731730315889, "grad_norm": 3.3372631072998047, "learning_rate": 4.446084472854255e-05, "loss": 0.0401, "num_input_tokens_seen": 12574112, "step": 12490 }, { "epoch": 5.891089108910891, "grad_norm": 0.2520969808101654, "learning_rate": 4.4454386281623995e-05, "loss": 0.0841, "num_input_tokens_seen": 12578656, "step": 12495 }, { "epoch": 5.893446487505893, "grad_norm": 1.3791368007659912, "learning_rate": 4.4447924541408156e-05, "loss": 0.1735, "num_input_tokens_seen": 12586784, "step": 12500 }, { "epoch": 5.895803866100896, "grad_norm": 1.1677840948104858, "learning_rate": 4.44414595089889e-05, "loss": 0.2458, "num_input_tokens_seen": 12591200, "step": 12505 }, { "epoch": 5.898161244695898, "grad_norm": 0.14389044046401978, "learning_rate": 4.443499118546065e-05, "loss": 0.1242, "num_input_tokens_seen": 12595808, "step": 12510 }, { "epoch": 5.900518623290901, "grad_norm": 0.07651569694280624, "learning_rate": 4.4428519571918376e-05, "loss": 0.0953, "num_input_tokens_seen": 12600288, "step": 12515 }, { "epoch": 5.902876001885903, "grad_norm": 0.6772247552871704, "learning_rate": 4.4422044669457616e-05, "loss": 0.2164, "num_input_tokens_seen": 12605120, "step": 12520 }, { "epoch": 5.905233380480905, "grad_norm": 0.03140570968389511, "learning_rate": 4.441556647917446e-05, "loss": 0.1288, "num_input_tokens_seen": 12610624, "step": 12525 }, { "epoch": 5.907590759075908, "grad_norm": 0.08194462209939957, "learning_rate": 4.4409085002165565e-05, "loss": 0.0514, "num_input_tokens_seen": 12615392, "step": 12530 }, { "epoch": 5.90994813767091, "grad_norm": 1.4555027484893799, "learning_rate": 4.440260023952811e-05, "loss": 0.1798, "num_input_tokens_seen": 12621216, "step": 12535 }, { "epoch": 5.912305516265913, "grad_norm": 0.4048461616039276, "learning_rate": 4.439611219235989e-05, "loss": 0.0959, "num_input_tokens_seen": 12625824, "step": 12540 }, { "epoch": 5.914662894860915, "grad_norm": 1.7619829177856445, "learning_rate": 4.43896208617592e-05, "loss": 0.2497, "num_input_tokens_seen": 12630752, "step": 12545 }, { "epoch": 5.9170202734559165, "grad_norm": 1.360486626625061, "learning_rate": 4.43831262488249e-05, "loss": 0.2289, "num_input_tokens_seen": 12636928, "step": 12550 }, { "epoch": 5.919377652050919, "grad_norm": 1.2426284551620483, "learning_rate": 4.4376628354656444e-05, "loss": 0.0815, "num_input_tokens_seen": 12642880, "step": 12555 }, { "epoch": 5.921735030645921, "grad_norm": 0.027322279289364815, "learning_rate": 4.4370127180353804e-05, "loss": 0.0692, "num_input_tokens_seen": 12647264, "step": 12560 }, { "epoch": 5.924092409240924, "grad_norm": 1.1148749589920044, "learning_rate": 4.4363622727017517e-05, "loss": 0.223, "num_input_tokens_seen": 12651776, "step": 12565 }, { "epoch": 5.926449787835926, "grad_norm": 2.0005311965942383, "learning_rate": 4.435711499574867e-05, "loss": 0.1385, "num_input_tokens_seen": 12655616, "step": 12570 }, { "epoch": 5.9288071664309285, "grad_norm": 0.34694382548332214, "learning_rate": 4.435060398764893e-05, "loss": 0.1834, "num_input_tokens_seen": 12660000, "step": 12575 }, { "epoch": 5.931164545025931, "grad_norm": 0.2306353747844696, "learning_rate": 4.434408970382049e-05, "loss": 0.0607, "num_input_tokens_seen": 12664352, "step": 12580 }, { "epoch": 5.933521923620933, "grad_norm": 1.846367359161377, "learning_rate": 4.4337572145366104e-05, "loss": 0.1268, "num_input_tokens_seen": 12669728, "step": 12585 }, { "epoch": 5.935879302215936, "grad_norm": 0.478509783744812, "learning_rate": 4.433105131338909e-05, "loss": 0.1004, "num_input_tokens_seen": 12674720, "step": 12590 }, { "epoch": 5.938236680810938, "grad_norm": 0.9024797677993774, "learning_rate": 4.4324527208993304e-05, "loss": 0.1385, "num_input_tokens_seen": 12679328, "step": 12595 }, { "epoch": 5.9405940594059405, "grad_norm": 0.03400678560137749, "learning_rate": 4.431799983328317e-05, "loss": 0.1142, "num_input_tokens_seen": 12684000, "step": 12600 }, { "epoch": 5.942951438000943, "grad_norm": 1.4194061756134033, "learning_rate": 4.431146918736368e-05, "loss": 0.1173, "num_input_tokens_seen": 12688896, "step": 12605 }, { "epoch": 5.945308816595945, "grad_norm": 0.20465649664402008, "learning_rate": 4.430493527234034e-05, "loss": 0.0963, "num_input_tokens_seen": 12693952, "step": 12610 }, { "epoch": 5.947666195190948, "grad_norm": 0.11070352047681808, "learning_rate": 4.4298398089319235e-05, "loss": 0.129, "num_input_tokens_seen": 12699104, "step": 12615 }, { "epoch": 5.95002357378595, "grad_norm": 0.8276849985122681, "learning_rate": 4.429185763940701e-05, "loss": 0.3446, "num_input_tokens_seen": 12704896, "step": 12620 }, { "epoch": 5.9523809523809526, "grad_norm": 0.22519400715827942, "learning_rate": 4.428531392371083e-05, "loss": 0.1462, "num_input_tokens_seen": 12709216, "step": 12625 }, { "epoch": 5.954738330975955, "grad_norm": 1.6169967651367188, "learning_rate": 4.427876694333846e-05, "loss": 0.2191, "num_input_tokens_seen": 12714016, "step": 12630 }, { "epoch": 5.957095709570957, "grad_norm": 0.13030609488487244, "learning_rate": 4.427221669939817e-05, "loss": 0.1354, "num_input_tokens_seen": 12718400, "step": 12635 }, { "epoch": 5.95945308816596, "grad_norm": 0.4196666181087494, "learning_rate": 4.426566319299883e-05, "loss": 0.1481, "num_input_tokens_seen": 12723392, "step": 12640 }, { "epoch": 5.961810466760962, "grad_norm": 0.09799220412969589, "learning_rate": 4.4259106425249816e-05, "loss": 0.136, "num_input_tokens_seen": 12728544, "step": 12645 }, { "epoch": 5.964167845355965, "grad_norm": 1.7076776027679443, "learning_rate": 4.4252546397261084e-05, "loss": 0.1844, "num_input_tokens_seen": 12733504, "step": 12650 }, { "epoch": 5.966525223950967, "grad_norm": 0.39322617650032043, "learning_rate": 4.424598311014314e-05, "loss": 0.0644, "num_input_tokens_seen": 12738240, "step": 12655 }, { "epoch": 5.968882602545969, "grad_norm": 1.0516799688339233, "learning_rate": 4.423941656500704e-05, "loss": 0.1359, "num_input_tokens_seen": 12743936, "step": 12660 }, { "epoch": 5.971239981140971, "grad_norm": 0.22415809333324432, "learning_rate": 4.423284676296437e-05, "loss": 0.0866, "num_input_tokens_seen": 12749056, "step": 12665 }, { "epoch": 5.973597359735973, "grad_norm": 1.2402830123901367, "learning_rate": 4.42262737051273e-05, "loss": 0.175, "num_input_tokens_seen": 12754080, "step": 12670 }, { "epoch": 5.975954738330976, "grad_norm": 1.0446244478225708, "learning_rate": 4.421969739260854e-05, "loss": 0.1696, "num_input_tokens_seen": 12760352, "step": 12675 }, { "epoch": 5.978312116925978, "grad_norm": 0.6090846061706543, "learning_rate": 4.4213117826521345e-05, "loss": 0.2081, "num_input_tokens_seen": 12764224, "step": 12680 }, { "epoch": 5.9806694955209805, "grad_norm": 2.163276433944702, "learning_rate": 4.420653500797952e-05, "loss": 0.3048, "num_input_tokens_seen": 12768192, "step": 12685 }, { "epoch": 5.983026874115983, "grad_norm": 0.681005597114563, "learning_rate": 4.419994893809743e-05, "loss": 0.0805, "num_input_tokens_seen": 12773216, "step": 12690 }, { "epoch": 5.985384252710985, "grad_norm": 0.422105997800827, "learning_rate": 4.419335961798997e-05, "loss": 0.114, "num_input_tokens_seen": 12778432, "step": 12695 }, { "epoch": 5.987741631305988, "grad_norm": 1.5995315313339233, "learning_rate": 4.4186767048772616e-05, "loss": 0.1266, "num_input_tokens_seen": 12782976, "step": 12700 }, { "epoch": 5.99009900990099, "grad_norm": 0.12984499335289001, "learning_rate": 4.4180171231561374e-05, "loss": 0.1614, "num_input_tokens_seen": 12789728, "step": 12705 }, { "epoch": 5.9924563884959925, "grad_norm": 0.2890125811100006, "learning_rate": 4.417357216747281e-05, "loss": 0.1225, "num_input_tokens_seen": 12797312, "step": 12710 }, { "epoch": 5.994813767090995, "grad_norm": 0.5566533803939819, "learning_rate": 4.416696985762402e-05, "loss": 0.042, "num_input_tokens_seen": 12801600, "step": 12715 }, { "epoch": 5.997171145685997, "grad_norm": 0.3846069872379303, "learning_rate": 4.4160364303132675e-05, "loss": 0.1121, "num_input_tokens_seen": 12808512, "step": 12720 }, { "epoch": 5.999528524281, "grad_norm": 1.517248511314392, "learning_rate": 4.4153755505116966e-05, "loss": 0.1657, "num_input_tokens_seen": 12812960, "step": 12725 }, { "epoch": 6.0, "eval_loss": 0.15432102978229523, "eval_runtime": 15.086, "eval_samples_per_second": 62.508, "eval_steps_per_second": 15.644, "num_input_tokens_seen": 12814080, "step": 12726 }, { "epoch": 6.001885902876002, "grad_norm": 0.17468991875648499, "learning_rate": 4.414714346469567e-05, "loss": 0.098, "num_input_tokens_seen": 12818112, "step": 12730 }, { "epoch": 6.0042432814710045, "grad_norm": 0.7519697546958923, "learning_rate": 4.414052818298809e-05, "loss": 0.0913, "num_input_tokens_seen": 12824096, "step": 12735 }, { "epoch": 6.006600660066007, "grad_norm": 0.11253329366445541, "learning_rate": 4.413390966111407e-05, "loss": 0.0273, "num_input_tokens_seen": 12829024, "step": 12740 }, { "epoch": 6.008958038661009, "grad_norm": 0.9944649934768677, "learning_rate": 4.412728790019403e-05, "loss": 0.1024, "num_input_tokens_seen": 12834464, "step": 12745 }, { "epoch": 6.011315417256012, "grad_norm": 1.3305145502090454, "learning_rate": 4.4120662901348896e-05, "loss": 0.1407, "num_input_tokens_seen": 12838144, "step": 12750 }, { "epoch": 6.013672795851014, "grad_norm": 0.0827622339129448, "learning_rate": 4.411403466570019e-05, "loss": 0.0272, "num_input_tokens_seen": 12843296, "step": 12755 }, { "epoch": 6.016030174446016, "grad_norm": 2.5358521938323975, "learning_rate": 4.410740319436996e-05, "loss": 0.1595, "num_input_tokens_seen": 12848640, "step": 12760 }, { "epoch": 6.018387553041018, "grad_norm": 0.08855786919593811, "learning_rate": 4.410076848848078e-05, "loss": 0.0854, "num_input_tokens_seen": 12853184, "step": 12765 }, { "epoch": 6.02074493163602, "grad_norm": 0.45377442240715027, "learning_rate": 4.409413054915582e-05, "loss": 0.15, "num_input_tokens_seen": 12857760, "step": 12770 }, { "epoch": 6.023102310231023, "grad_norm": 1.3567107915878296, "learning_rate": 4.4087489377518754e-05, "loss": 0.2163, "num_input_tokens_seen": 12862816, "step": 12775 }, { "epoch": 6.025459688826025, "grad_norm": 1.5449365377426147, "learning_rate": 4.4080844974693816e-05, "loss": 0.1597, "num_input_tokens_seen": 12866912, "step": 12780 }, { "epoch": 6.027817067421028, "grad_norm": 0.3043205738067627, "learning_rate": 4.407419734180581e-05, "loss": 0.0446, "num_input_tokens_seen": 12872480, "step": 12785 }, { "epoch": 6.03017444601603, "grad_norm": 0.4789775311946869, "learning_rate": 4.4067546479980046e-05, "loss": 0.1838, "num_input_tokens_seen": 12878240, "step": 12790 }, { "epoch": 6.032531824611032, "grad_norm": 0.23001894354820251, "learning_rate": 4.4060892390342416e-05, "loss": 0.1158, "num_input_tokens_seen": 12885088, "step": 12795 }, { "epoch": 6.034889203206035, "grad_norm": 0.21618710458278656, "learning_rate": 4.405423507401934e-05, "loss": 0.1024, "num_input_tokens_seen": 12890080, "step": 12800 }, { "epoch": 6.037246581801037, "grad_norm": 0.9072019457817078, "learning_rate": 4.4047574532137794e-05, "loss": 0.2422, "num_input_tokens_seen": 12895616, "step": 12805 }, { "epoch": 6.03960396039604, "grad_norm": 1.3616188764572144, "learning_rate": 4.4040910765825284e-05, "loss": 0.2492, "num_input_tokens_seen": 12901152, "step": 12810 }, { "epoch": 6.041961338991042, "grad_norm": 0.4125811457633972, "learning_rate": 4.403424377620987e-05, "loss": 0.0547, "num_input_tokens_seen": 12906720, "step": 12815 }, { "epoch": 6.044318717586044, "grad_norm": 0.12162761390209198, "learning_rate": 4.402757356442018e-05, "loss": 0.0611, "num_input_tokens_seen": 12911168, "step": 12820 }, { "epoch": 6.046676096181047, "grad_norm": 0.49432533979415894, "learning_rate": 4.4020900131585346e-05, "loss": 0.1251, "num_input_tokens_seen": 12915168, "step": 12825 }, { "epoch": 6.049033474776049, "grad_norm": 0.8933297395706177, "learning_rate": 4.401422347883508e-05, "loss": 0.0928, "num_input_tokens_seen": 12920032, "step": 12830 }, { "epoch": 6.051390853371052, "grad_norm": 0.623494565486908, "learning_rate": 4.4007543607299624e-05, "loss": 0.0503, "num_input_tokens_seen": 12924736, "step": 12835 }, { "epoch": 6.053748231966054, "grad_norm": 0.17827290296554565, "learning_rate": 4.4000860518109764e-05, "loss": 0.1641, "num_input_tokens_seen": 12930624, "step": 12840 }, { "epoch": 6.0561056105610565, "grad_norm": 0.1876472383737564, "learning_rate": 4.3994174212396834e-05, "loss": 0.0121, "num_input_tokens_seen": 12935104, "step": 12845 }, { "epoch": 6.058462989156059, "grad_norm": 0.1877129077911377, "learning_rate": 4.3987484691292714e-05, "loss": 0.2282, "num_input_tokens_seen": 12939840, "step": 12850 }, { "epoch": 6.060820367751061, "grad_norm": 0.4390355944633484, "learning_rate": 4.398079195592982e-05, "loss": 0.0962, "num_input_tokens_seen": 12945280, "step": 12855 }, { "epoch": 6.063177746346063, "grad_norm": 0.038604486733675, "learning_rate": 4.397409600744112e-05, "loss": 0.0321, "num_input_tokens_seen": 12950208, "step": 12860 }, { "epoch": 6.065535124941065, "grad_norm": 2.3907763957977295, "learning_rate": 4.396739684696014e-05, "loss": 0.1206, "num_input_tokens_seen": 12954976, "step": 12865 }, { "epoch": 6.067892503536068, "grad_norm": 0.7463318705558777, "learning_rate": 4.396069447562092e-05, "loss": 0.2169, "num_input_tokens_seen": 12960032, "step": 12870 }, { "epoch": 6.07024988213107, "grad_norm": 0.11109860986471176, "learning_rate": 4.395398889455805e-05, "loss": 0.0576, "num_input_tokens_seen": 12965280, "step": 12875 }, { "epoch": 6.072607260726072, "grad_norm": 0.7190220952033997, "learning_rate": 4.394728010490669e-05, "loss": 0.2139, "num_input_tokens_seen": 12971552, "step": 12880 }, { "epoch": 6.074964639321075, "grad_norm": 1.1386299133300781, "learning_rate": 4.394056810780251e-05, "loss": 0.1114, "num_input_tokens_seen": 12976736, "step": 12885 }, { "epoch": 6.077322017916077, "grad_norm": 0.14565801620483398, "learning_rate": 4.3933852904381745e-05, "loss": 0.135, "num_input_tokens_seen": 12981408, "step": 12890 }, { "epoch": 6.07967939651108, "grad_norm": 1.4273720979690552, "learning_rate": 4.392713449578115e-05, "loss": 0.0579, "num_input_tokens_seen": 12985728, "step": 12895 }, { "epoch": 6.082036775106082, "grad_norm": 0.1131402850151062, "learning_rate": 4.3920412883138065e-05, "loss": 0.2383, "num_input_tokens_seen": 12991136, "step": 12900 }, { "epoch": 6.084394153701084, "grad_norm": 0.1027163416147232, "learning_rate": 4.391368806759032e-05, "loss": 0.1067, "num_input_tokens_seen": 12995424, "step": 12905 }, { "epoch": 6.086751532296087, "grad_norm": 0.5138659477233887, "learning_rate": 4.390696005027633e-05, "loss": 0.1425, "num_input_tokens_seen": 13000512, "step": 12910 }, { "epoch": 6.089108910891089, "grad_norm": 0.7005214095115662, "learning_rate": 4.390022883233502e-05, "loss": 0.2896, "num_input_tokens_seen": 13005216, "step": 12915 }, { "epoch": 6.091466289486092, "grad_norm": 0.06900987774133682, "learning_rate": 4.3893494414905875e-05, "loss": 0.2032, "num_input_tokens_seen": 13011552, "step": 12920 }, { "epoch": 6.093823668081094, "grad_norm": 0.5003913640975952, "learning_rate": 4.388675679912892e-05, "loss": 0.207, "num_input_tokens_seen": 13016960, "step": 12925 }, { "epoch": 6.096181046676096, "grad_norm": 0.9993194937705994, "learning_rate": 4.388001598614472e-05, "loss": 0.2386, "num_input_tokens_seen": 13022176, "step": 12930 }, { "epoch": 6.098538425271099, "grad_norm": 0.3611041009426117, "learning_rate": 4.3873271977094375e-05, "loss": 0.0853, "num_input_tokens_seen": 13027104, "step": 12935 }, { "epoch": 6.100895803866101, "grad_norm": 0.1272822469472885, "learning_rate": 4.3866524773119534e-05, "loss": 0.0788, "num_input_tokens_seen": 13031904, "step": 12940 }, { "epoch": 6.103253182461104, "grad_norm": 1.6399893760681152, "learning_rate": 4.3859774375362385e-05, "loss": 0.1412, "num_input_tokens_seen": 13037632, "step": 12945 }, { "epoch": 6.105610561056106, "grad_norm": 1.857873797416687, "learning_rate": 4.385302078496566e-05, "loss": 0.19, "num_input_tokens_seen": 13042720, "step": 12950 }, { "epoch": 6.107967939651108, "grad_norm": 0.3985179662704468, "learning_rate": 4.384626400307261e-05, "loss": 0.0755, "num_input_tokens_seen": 13047136, "step": 12955 }, { "epoch": 6.11032531824611, "grad_norm": 0.05631735175848007, "learning_rate": 4.383950403082707e-05, "loss": 0.1118, "num_input_tokens_seen": 13052032, "step": 12960 }, { "epoch": 6.112682696841112, "grad_norm": 0.25462818145751953, "learning_rate": 4.383274086937336e-05, "loss": 0.2095, "num_input_tokens_seen": 13058848, "step": 12965 }, { "epoch": 6.115040075436115, "grad_norm": 1.9058334827423096, "learning_rate": 4.3825974519856394e-05, "loss": 0.2395, "num_input_tokens_seen": 13062848, "step": 12970 }, { "epoch": 6.117397454031117, "grad_norm": 1.3902949094772339, "learning_rate": 4.381920498342158e-05, "loss": 0.1237, "num_input_tokens_seen": 13068352, "step": 12975 }, { "epoch": 6.1197548326261195, "grad_norm": 0.9087682962417603, "learning_rate": 4.38124322612149e-05, "loss": 0.1405, "num_input_tokens_seen": 13073536, "step": 12980 }, { "epoch": 6.122112211221122, "grad_norm": 0.724320113658905, "learning_rate": 4.380565635438286e-05, "loss": 0.0775, "num_input_tokens_seen": 13080352, "step": 12985 }, { "epoch": 6.124469589816124, "grad_norm": 0.09142310917377472, "learning_rate": 4.379887726407249e-05, "loss": 0.0685, "num_input_tokens_seen": 13085984, "step": 12990 }, { "epoch": 6.126826968411127, "grad_norm": 0.6169815063476562, "learning_rate": 4.379209499143139e-05, "loss": 0.1366, "num_input_tokens_seen": 13090560, "step": 12995 }, { "epoch": 6.129184347006129, "grad_norm": 0.2446606457233429, "learning_rate": 4.3785309537607685e-05, "loss": 0.0538, "num_input_tokens_seen": 13096192, "step": 13000 }, { "epoch": 6.1315417256011315, "grad_norm": 0.16419605910778046, "learning_rate": 4.3778520903750025e-05, "loss": 0.0311, "num_input_tokens_seen": 13101280, "step": 13005 }, { "epoch": 6.133899104196134, "grad_norm": 0.17879144847393036, "learning_rate": 4.377172909100762e-05, "loss": 0.187, "num_input_tokens_seen": 13108000, "step": 13010 }, { "epoch": 6.136256482791136, "grad_norm": 0.7999851107597351, "learning_rate": 4.3764934100530205e-05, "loss": 0.2459, "num_input_tokens_seen": 13113888, "step": 13015 }, { "epoch": 6.138613861386139, "grad_norm": 0.21218416094779968, "learning_rate": 4.375813593346806e-05, "loss": 0.1027, "num_input_tokens_seen": 13119552, "step": 13020 }, { "epoch": 6.140971239981141, "grad_norm": 1.0722018480300903, "learning_rate": 4.3751334590971996e-05, "loss": 0.1803, "num_input_tokens_seen": 13125280, "step": 13025 }, { "epoch": 6.1433286185761435, "grad_norm": 1.8585938215255737, "learning_rate": 4.374453007419336e-05, "loss": 0.2035, "num_input_tokens_seen": 13129856, "step": 13030 }, { "epoch": 6.145685997171146, "grad_norm": 0.3320162296295166, "learning_rate": 4.373772238428405e-05, "loss": 0.1973, "num_input_tokens_seen": 13134944, "step": 13035 }, { "epoch": 6.148043375766148, "grad_norm": 0.3130015730857849, "learning_rate": 4.3730911522396486e-05, "loss": 0.1713, "num_input_tokens_seen": 13139264, "step": 13040 }, { "epoch": 6.150400754361151, "grad_norm": 1.4410218000411987, "learning_rate": 4.3724097489683634e-05, "loss": 0.1492, "num_input_tokens_seen": 13144896, "step": 13045 }, { "epoch": 6.152758132956153, "grad_norm": 0.8576679229736328, "learning_rate": 4.3717280287299e-05, "loss": 0.069, "num_input_tokens_seen": 13150112, "step": 13050 }, { "epoch": 6.1551155115511555, "grad_norm": 0.8495700359344482, "learning_rate": 4.3710459916396606e-05, "loss": 0.0492, "num_input_tokens_seen": 13155008, "step": 13055 }, { "epoch": 6.157472890146157, "grad_norm": 0.4206821322441101, "learning_rate": 4.3703636378131044e-05, "loss": 0.1673, "num_input_tokens_seen": 13159584, "step": 13060 }, { "epoch": 6.1598302687411595, "grad_norm": 1.1911622285842896, "learning_rate": 4.36968096736574e-05, "loss": 0.2096, "num_input_tokens_seen": 13165536, "step": 13065 }, { "epoch": 6.162187647336162, "grad_norm": 1.1763609647750854, "learning_rate": 4.3689979804131344e-05, "loss": 0.0502, "num_input_tokens_seen": 13170336, "step": 13070 }, { "epoch": 6.164545025931164, "grad_norm": 0.8271948099136353, "learning_rate": 4.368314677070904e-05, "loss": 0.0843, "num_input_tokens_seen": 13175808, "step": 13075 }, { "epoch": 6.166902404526167, "grad_norm": 0.8868419528007507, "learning_rate": 4.3676310574547206e-05, "loss": 0.2368, "num_input_tokens_seen": 13180000, "step": 13080 }, { "epoch": 6.169259783121169, "grad_norm": 0.5692086815834045, "learning_rate": 4.3669471216803104e-05, "loss": 0.0844, "num_input_tokens_seen": 13185664, "step": 13085 }, { "epoch": 6.1716171617161715, "grad_norm": 2.3341336250305176, "learning_rate": 4.366262869863451e-05, "loss": 0.2274, "num_input_tokens_seen": 13189600, "step": 13090 }, { "epoch": 6.173974540311174, "grad_norm": 0.35150909423828125, "learning_rate": 4.365578302119976e-05, "loss": 0.0676, "num_input_tokens_seen": 13193824, "step": 13095 }, { "epoch": 6.176331918906176, "grad_norm": 1.060197114944458, "learning_rate": 4.3648934185657683e-05, "loss": 0.1641, "num_input_tokens_seen": 13198432, "step": 13100 }, { "epoch": 6.178689297501179, "grad_norm": 0.8939535617828369, "learning_rate": 4.364208219316771e-05, "loss": 0.0432, "num_input_tokens_seen": 13202240, "step": 13105 }, { "epoch": 6.181046676096181, "grad_norm": 1.0543315410614014, "learning_rate": 4.363522704488974e-05, "loss": 0.0811, "num_input_tokens_seen": 13208480, "step": 13110 }, { "epoch": 6.1834040546911835, "grad_norm": 0.6115830540657043, "learning_rate": 4.362836874198423e-05, "loss": 0.1869, "num_input_tokens_seen": 13213344, "step": 13115 }, { "epoch": 6.185761433286186, "grad_norm": 0.3473632335662842, "learning_rate": 4.362150728561219e-05, "loss": 0.1657, "num_input_tokens_seen": 13219200, "step": 13120 }, { "epoch": 6.188118811881188, "grad_norm": 0.021831389516592026, "learning_rate": 4.361464267693514e-05, "loss": 0.1473, "num_input_tokens_seen": 13224832, "step": 13125 }, { "epoch": 6.190476190476191, "grad_norm": 0.25614362955093384, "learning_rate": 4.360777491711515e-05, "loss": 0.0462, "num_input_tokens_seen": 13230272, "step": 13130 }, { "epoch": 6.192833569071193, "grad_norm": 0.06400343775749207, "learning_rate": 4.36009040073148e-05, "loss": 0.0543, "num_input_tokens_seen": 13235616, "step": 13135 }, { "epoch": 6.1951909476661955, "grad_norm": 0.2955912947654724, "learning_rate": 4.359402994869723e-05, "loss": 0.1022, "num_input_tokens_seen": 13239840, "step": 13140 }, { "epoch": 6.197548326261198, "grad_norm": 0.46068790555000305, "learning_rate": 4.3587152742426096e-05, "loss": 0.0454, "num_input_tokens_seen": 13244480, "step": 13145 }, { "epoch": 6.1999057048562, "grad_norm": 0.7062370181083679, "learning_rate": 4.35802723896656e-05, "loss": 0.0729, "num_input_tokens_seen": 13248800, "step": 13150 }, { "epoch": 6.202263083451202, "grad_norm": 1.6223000288009644, "learning_rate": 4.357338889158046e-05, "loss": 0.1516, "num_input_tokens_seen": 13253056, "step": 13155 }, { "epoch": 6.204620462046204, "grad_norm": 0.03644903376698494, "learning_rate": 4.356650224933594e-05, "loss": 0.0388, "num_input_tokens_seen": 13258016, "step": 13160 }, { "epoch": 6.206977840641207, "grad_norm": 0.6847310662269592, "learning_rate": 4.355961246409783e-05, "loss": 0.1861, "num_input_tokens_seen": 13263328, "step": 13165 }, { "epoch": 6.209335219236209, "grad_norm": 2.195383310317993, "learning_rate": 4.355271953703246e-05, "loss": 0.2372, "num_input_tokens_seen": 13267040, "step": 13170 }, { "epoch": 6.211692597831211, "grad_norm": 0.39023545384407043, "learning_rate": 4.354582346930667e-05, "loss": 0.1847, "num_input_tokens_seen": 13271648, "step": 13175 }, { "epoch": 6.214049976426214, "grad_norm": 1.1791436672210693, "learning_rate": 4.353892426208787e-05, "loss": 0.164, "num_input_tokens_seen": 13276384, "step": 13180 }, { "epoch": 6.216407355021216, "grad_norm": 0.14090628921985626, "learning_rate": 4.353202191654395e-05, "loss": 0.0901, "num_input_tokens_seen": 13281120, "step": 13185 }, { "epoch": 6.218764733616219, "grad_norm": 0.024587472900748253, "learning_rate": 4.352511643384338e-05, "loss": 0.0439, "num_input_tokens_seen": 13285504, "step": 13190 }, { "epoch": 6.221122112211221, "grad_norm": 0.5017831921577454, "learning_rate": 4.351820781515514e-05, "loss": 0.0674, "num_input_tokens_seen": 13290048, "step": 13195 }, { "epoch": 6.223479490806223, "grad_norm": 2.0863187313079834, "learning_rate": 4.351129606164873e-05, "loss": 0.2886, "num_input_tokens_seen": 13294784, "step": 13200 }, { "epoch": 6.225836869401226, "grad_norm": 0.2940298020839691, "learning_rate": 4.350438117449421e-05, "loss": 0.1274, "num_input_tokens_seen": 13300064, "step": 13205 }, { "epoch": 6.228194247996228, "grad_norm": 0.1462378352880478, "learning_rate": 4.349746315486214e-05, "loss": 0.2172, "num_input_tokens_seen": 13304000, "step": 13210 }, { "epoch": 6.230551626591231, "grad_norm": 0.2510366439819336, "learning_rate": 4.349054200392363e-05, "loss": 0.0512, "num_input_tokens_seen": 13309696, "step": 13215 }, { "epoch": 6.232909005186233, "grad_norm": 1.5988445281982422, "learning_rate": 4.3483617722850305e-05, "loss": 0.1987, "num_input_tokens_seen": 13313952, "step": 13220 }, { "epoch": 6.235266383781235, "grad_norm": 0.11764737218618393, "learning_rate": 4.347669031281434e-05, "loss": 0.1013, "num_input_tokens_seen": 13318592, "step": 13225 }, { "epoch": 6.237623762376238, "grad_norm": 1.2337833642959595, "learning_rate": 4.3469759774988414e-05, "loss": 0.2251, "num_input_tokens_seen": 13322720, "step": 13230 }, { "epoch": 6.23998114097124, "grad_norm": 0.07907611131668091, "learning_rate": 4.346282611054576e-05, "loss": 0.0614, "num_input_tokens_seen": 13329344, "step": 13235 }, { "epoch": 6.242338519566243, "grad_norm": 0.7428401112556458, "learning_rate": 4.3455889320660126e-05, "loss": 0.1025, "num_input_tokens_seen": 13335136, "step": 13240 }, { "epoch": 6.244695898161245, "grad_norm": 0.22178499400615692, "learning_rate": 4.344894940650579e-05, "loss": 0.1194, "num_input_tokens_seen": 13340736, "step": 13245 }, { "epoch": 6.247053276756247, "grad_norm": 1.0463966131210327, "learning_rate": 4.3442006369257574e-05, "loss": 0.1394, "num_input_tokens_seen": 13347072, "step": 13250 }, { "epoch": 6.24941065535125, "grad_norm": 0.8757654428482056, "learning_rate": 4.3435060210090806e-05, "loss": 0.3412, "num_input_tokens_seen": 13352480, "step": 13255 }, { "epoch": 6.251768033946251, "grad_norm": 1.8752378225326538, "learning_rate": 4.342811093018134e-05, "loss": 0.2391, "num_input_tokens_seen": 13356768, "step": 13260 }, { "epoch": 6.254125412541254, "grad_norm": 0.13149668276309967, "learning_rate": 4.34211585307056e-05, "loss": 0.0454, "num_input_tokens_seen": 13360896, "step": 13265 }, { "epoch": 6.256482791136256, "grad_norm": 1.4837781190872192, "learning_rate": 4.3414203012840494e-05, "loss": 0.1915, "num_input_tokens_seen": 13366272, "step": 13270 }, { "epoch": 6.258840169731259, "grad_norm": 0.1670607030391693, "learning_rate": 4.340724437776347e-05, "loss": 0.0949, "num_input_tokens_seen": 13370464, "step": 13275 }, { "epoch": 6.261197548326261, "grad_norm": 0.2606348693370819, "learning_rate": 4.34002826266525e-05, "loss": 0.1384, "num_input_tokens_seen": 13375424, "step": 13280 }, { "epoch": 6.263554926921263, "grad_norm": 0.9272729754447937, "learning_rate": 4.3393317760686114e-05, "loss": 0.1233, "num_input_tokens_seen": 13380736, "step": 13285 }, { "epoch": 6.265912305516266, "grad_norm": 0.1890728622674942, "learning_rate": 4.338634978104332e-05, "loss": 0.0814, "num_input_tokens_seen": 13385088, "step": 13290 }, { "epoch": 6.268269684111268, "grad_norm": 1.5864146947860718, "learning_rate": 4.337937868890371e-05, "loss": 0.1475, "num_input_tokens_seen": 13390208, "step": 13295 }, { "epoch": 6.270627062706271, "grad_norm": 1.6942952871322632, "learning_rate": 4.3372404485447324e-05, "loss": 0.276, "num_input_tokens_seen": 13395488, "step": 13300 }, { "epoch": 6.272984441301273, "grad_norm": 0.22556038200855255, "learning_rate": 4.3365427171854823e-05, "loss": 0.1008, "num_input_tokens_seen": 13400608, "step": 13305 }, { "epoch": 6.275341819896275, "grad_norm": 0.11290030926465988, "learning_rate": 4.335844674930733e-05, "loss": 0.2154, "num_input_tokens_seen": 13406784, "step": 13310 }, { "epoch": 6.277699198491278, "grad_norm": 2.1096811294555664, "learning_rate": 4.335146321898651e-05, "loss": 0.2446, "num_input_tokens_seen": 13412032, "step": 13315 }, { "epoch": 6.28005657708628, "grad_norm": 1.2650243043899536, "learning_rate": 4.334447658207455e-05, "loss": 0.138, "num_input_tokens_seen": 13420768, "step": 13320 }, { "epoch": 6.282413955681283, "grad_norm": 1.3806613683700562, "learning_rate": 4.333748683975418e-05, "loss": 0.139, "num_input_tokens_seen": 13424768, "step": 13325 }, { "epoch": 6.284771334276285, "grad_norm": 0.12439437955617905, "learning_rate": 4.3330493993208644e-05, "loss": 0.1426, "num_input_tokens_seen": 13428800, "step": 13330 }, { "epoch": 6.287128712871287, "grad_norm": 1.3350887298583984, "learning_rate": 4.33234980436217e-05, "loss": 0.1614, "num_input_tokens_seen": 13433344, "step": 13335 }, { "epoch": 6.28948609146629, "grad_norm": 0.5944260358810425, "learning_rate": 4.331649899217766e-05, "loss": 0.1867, "num_input_tokens_seen": 13438272, "step": 13340 }, { "epoch": 6.291843470061292, "grad_norm": 0.10832259058952332, "learning_rate": 4.3309496840061336e-05, "loss": 0.0294, "num_input_tokens_seen": 13443104, "step": 13345 }, { "epoch": 6.294200848656295, "grad_norm": 0.6393519639968872, "learning_rate": 4.330249158845807e-05, "loss": 0.0952, "num_input_tokens_seen": 13449152, "step": 13350 }, { "epoch": 6.296558227251296, "grad_norm": 0.3354465067386627, "learning_rate": 4.3295483238553744e-05, "loss": 0.1473, "num_input_tokens_seen": 13454112, "step": 13355 }, { "epoch": 6.2989156058462985, "grad_norm": 0.39570948481559753, "learning_rate": 4.3288471791534743e-05, "loss": 0.1615, "num_input_tokens_seen": 13459424, "step": 13360 }, { "epoch": 6.301272984441301, "grad_norm": 0.21195049583911896, "learning_rate": 4.3281457248587986e-05, "loss": 0.0686, "num_input_tokens_seen": 13464160, "step": 13365 }, { "epoch": 6.303630363036303, "grad_norm": 0.7997726798057556, "learning_rate": 4.327443961090092e-05, "loss": 0.1156, "num_input_tokens_seen": 13468704, "step": 13370 }, { "epoch": 6.305987741631306, "grad_norm": 0.30255433917045593, "learning_rate": 4.3267418879661504e-05, "loss": 0.1974, "num_input_tokens_seen": 13473184, "step": 13375 }, { "epoch": 6.308345120226308, "grad_norm": 1.0049285888671875, "learning_rate": 4.326039505605824e-05, "loss": 0.137, "num_input_tokens_seen": 13477824, "step": 13380 }, { "epoch": 6.3107024988213105, "grad_norm": 0.668661892414093, "learning_rate": 4.3253368141280136e-05, "loss": 0.2264, "num_input_tokens_seen": 13484288, "step": 13385 }, { "epoch": 6.313059877416313, "grad_norm": 2.2497448921203613, "learning_rate": 4.324633813651674e-05, "loss": 0.0665, "num_input_tokens_seen": 13489312, "step": 13390 }, { "epoch": 6.315417256011315, "grad_norm": 2.074423313140869, "learning_rate": 4.323930504295809e-05, "loss": 0.2132, "num_input_tokens_seen": 13495040, "step": 13395 }, { "epoch": 6.317774634606318, "grad_norm": 0.37215113639831543, "learning_rate": 4.323226886179479e-05, "loss": 0.0999, "num_input_tokens_seen": 13499616, "step": 13400 }, { "epoch": 6.32013201320132, "grad_norm": 0.9244139194488525, "learning_rate": 4.322522959421793e-05, "loss": 0.0503, "num_input_tokens_seen": 13503616, "step": 13405 }, { "epoch": 6.3224893917963225, "grad_norm": 0.2163153439760208, "learning_rate": 4.321818724141915e-05, "loss": 0.0511, "num_input_tokens_seen": 13509440, "step": 13410 }, { "epoch": 6.324846770391325, "grad_norm": 0.9721001386642456, "learning_rate": 4.3211141804590595e-05, "loss": 0.1919, "num_input_tokens_seen": 13514752, "step": 13415 }, { "epoch": 6.327204148986327, "grad_norm": 1.5455844402313232, "learning_rate": 4.3204093284924946e-05, "loss": 0.0873, "num_input_tokens_seen": 13519360, "step": 13420 }, { "epoch": 6.32956152758133, "grad_norm": 0.15068040788173676, "learning_rate": 4.3197041683615394e-05, "loss": 0.1096, "num_input_tokens_seen": 13525120, "step": 13425 }, { "epoch": 6.331918906176332, "grad_norm": 0.5630671381950378, "learning_rate": 4.318998700185565e-05, "loss": 0.0589, "num_input_tokens_seen": 13530208, "step": 13430 }, { "epoch": 6.3342762847713345, "grad_norm": 1.8230490684509277, "learning_rate": 4.318292924083994e-05, "loss": 0.2136, "num_input_tokens_seen": 13535328, "step": 13435 }, { "epoch": 6.336633663366337, "grad_norm": 0.7024107575416565, "learning_rate": 4.3175868401763055e-05, "loss": 0.1415, "num_input_tokens_seen": 13539936, "step": 13440 }, { "epoch": 6.338991041961339, "grad_norm": 0.7581822276115417, "learning_rate": 4.3168804485820257e-05, "loss": 0.1027, "num_input_tokens_seen": 13544448, "step": 13445 }, { "epoch": 6.341348420556342, "grad_norm": 0.7390672564506531, "learning_rate": 4.316173749420734e-05, "loss": 0.072, "num_input_tokens_seen": 13550336, "step": 13450 }, { "epoch": 6.343705799151344, "grad_norm": 2.8641867637634277, "learning_rate": 4.3154667428120645e-05, "loss": 0.2402, "num_input_tokens_seen": 13556480, "step": 13455 }, { "epoch": 6.346063177746346, "grad_norm": 1.7256988286972046, "learning_rate": 4.3147594288757e-05, "loss": 0.2835, "num_input_tokens_seen": 13562752, "step": 13460 }, { "epoch": 6.348420556341348, "grad_norm": 0.3725242614746094, "learning_rate": 4.314051807731376e-05, "loss": 0.1313, "num_input_tokens_seen": 13566944, "step": 13465 }, { "epoch": 6.3507779349363505, "grad_norm": 0.9772306680679321, "learning_rate": 4.313343879498884e-05, "loss": 0.1627, "num_input_tokens_seen": 13571232, "step": 13470 }, { "epoch": 6.353135313531353, "grad_norm": 0.4716961085796356, "learning_rate": 4.31263564429806e-05, "loss": 0.3316, "num_input_tokens_seen": 13576544, "step": 13475 }, { "epoch": 6.355492692126355, "grad_norm": 0.010967363603413105, "learning_rate": 4.3119271022487986e-05, "loss": 0.0615, "num_input_tokens_seen": 13580800, "step": 13480 }, { "epoch": 6.357850070721358, "grad_norm": 0.38136959075927734, "learning_rate": 4.311218253471044e-05, "loss": 0.2381, "num_input_tokens_seen": 13587040, "step": 13485 }, { "epoch": 6.36020744931636, "grad_norm": 0.18842461705207825, "learning_rate": 4.310509098084791e-05, "loss": 0.1173, "num_input_tokens_seen": 13591424, "step": 13490 }, { "epoch": 6.3625648279113625, "grad_norm": 0.16574156284332275, "learning_rate": 4.309799636210089e-05, "loss": 0.0627, "num_input_tokens_seen": 13596992, "step": 13495 }, { "epoch": 6.364922206506365, "grad_norm": 0.3481943607330322, "learning_rate": 4.3090898679670376e-05, "loss": 0.0725, "num_input_tokens_seen": 13602144, "step": 13500 }, { "epoch": 6.367279585101367, "grad_norm": 0.6574268937110901, "learning_rate": 4.3083797934757874e-05, "loss": 0.0588, "num_input_tokens_seen": 13606528, "step": 13505 }, { "epoch": 6.36963696369637, "grad_norm": 0.5798026323318481, "learning_rate": 4.307669412856543e-05, "loss": 0.0826, "num_input_tokens_seen": 13610624, "step": 13510 }, { "epoch": 6.371994342291372, "grad_norm": 0.4564819633960724, "learning_rate": 4.30695872622956e-05, "loss": 0.0506, "num_input_tokens_seen": 13616256, "step": 13515 }, { "epoch": 6.3743517208863745, "grad_norm": 1.1204088926315308, "learning_rate": 4.306247733715145e-05, "loss": 0.1254, "num_input_tokens_seen": 13620832, "step": 13520 }, { "epoch": 6.376709099481377, "grad_norm": 0.2794356644153595, "learning_rate": 4.3055364354336565e-05, "loss": 0.0796, "num_input_tokens_seen": 13625664, "step": 13525 }, { "epoch": 6.379066478076379, "grad_norm": 0.14073187112808228, "learning_rate": 4.3048248315055054e-05, "loss": 0.1228, "num_input_tokens_seen": 13630592, "step": 13530 }, { "epoch": 6.381423856671382, "grad_norm": 1.2283053398132324, "learning_rate": 4.304112922051155e-05, "loss": 0.1004, "num_input_tokens_seen": 13634336, "step": 13535 }, { "epoch": 6.383781235266384, "grad_norm": 0.1689547598361969, "learning_rate": 4.30340070719112e-05, "loss": 0.145, "num_input_tokens_seen": 13639552, "step": 13540 }, { "epoch": 6.3861386138613865, "grad_norm": 1.0414763689041138, "learning_rate": 4.302688187045964e-05, "loss": 0.0723, "num_input_tokens_seen": 13645536, "step": 13545 }, { "epoch": 6.388495992456389, "grad_norm": 1.503474473953247, "learning_rate": 4.3019753617363056e-05, "loss": 0.1025, "num_input_tokens_seen": 13650144, "step": 13550 }, { "epoch": 6.39085337105139, "grad_norm": 1.0158157348632812, "learning_rate": 4.3012622313828156e-05, "loss": 0.0368, "num_input_tokens_seen": 13655392, "step": 13555 }, { "epoch": 6.393210749646393, "grad_norm": 0.34446191787719727, "learning_rate": 4.3005487961062126e-05, "loss": 0.1528, "num_input_tokens_seen": 13660864, "step": 13560 }, { "epoch": 6.395568128241395, "grad_norm": 1.5598254203796387, "learning_rate": 4.2998350560272696e-05, "loss": 0.2331, "num_input_tokens_seen": 13665888, "step": 13565 }, { "epoch": 6.397925506836398, "grad_norm": 0.4659784734249115, "learning_rate": 4.299121011266812e-05, "loss": 0.0902, "num_input_tokens_seen": 13670112, "step": 13570 }, { "epoch": 6.4002828854314, "grad_norm": 1.6238001585006714, "learning_rate": 4.2984066619457144e-05, "loss": 0.3347, "num_input_tokens_seen": 13674464, "step": 13575 }, { "epoch": 6.402640264026402, "grad_norm": 0.47190341353416443, "learning_rate": 4.297692008184904e-05, "loss": 0.2435, "num_input_tokens_seen": 13679648, "step": 13580 }, { "epoch": 6.404997642621405, "grad_norm": 0.6987339854240417, "learning_rate": 4.2969770501053586e-05, "loss": 0.245, "num_input_tokens_seen": 13684320, "step": 13585 }, { "epoch": 6.407355021216407, "grad_norm": 0.12030137330293655, "learning_rate": 4.296261787828111e-05, "loss": 0.2335, "num_input_tokens_seen": 13688576, "step": 13590 }, { "epoch": 6.40971239981141, "grad_norm": 0.6291443109512329, "learning_rate": 4.295546221474241e-05, "loss": 0.0962, "num_input_tokens_seen": 13693760, "step": 13595 }, { "epoch": 6.412069778406412, "grad_norm": 1.1687309741973877, "learning_rate": 4.294830351164883e-05, "loss": 0.0594, "num_input_tokens_seen": 13698112, "step": 13600 }, { "epoch": 6.414427157001414, "grad_norm": 0.3711082339286804, "learning_rate": 4.2941141770212195e-05, "loss": 0.1269, "num_input_tokens_seen": 13702528, "step": 13605 }, { "epoch": 6.416784535596417, "grad_norm": 0.07780081778764725, "learning_rate": 4.293397699164489e-05, "loss": 0.1064, "num_input_tokens_seen": 13707520, "step": 13610 }, { "epoch": 6.419141914191419, "grad_norm": 0.7072486281394958, "learning_rate": 4.292680917715978e-05, "loss": 0.1101, "num_input_tokens_seen": 13712672, "step": 13615 }, { "epoch": 6.421499292786422, "grad_norm": 0.3511371612548828, "learning_rate": 4.2919638327970255e-05, "loss": 0.0678, "num_input_tokens_seen": 13717440, "step": 13620 }, { "epoch": 6.423856671381424, "grad_norm": 0.37013930082321167, "learning_rate": 4.291246444529023e-05, "loss": 0.153, "num_input_tokens_seen": 13722272, "step": 13625 }, { "epoch": 6.426214049976426, "grad_norm": 1.1435604095458984, "learning_rate": 4.2905287530334106e-05, "loss": 0.2145, "num_input_tokens_seen": 13727040, "step": 13630 }, { "epoch": 6.428571428571429, "grad_norm": 0.02390347421169281, "learning_rate": 4.289810758431681e-05, "loss": 0.0688, "num_input_tokens_seen": 13731872, "step": 13635 }, { "epoch": 6.430928807166431, "grad_norm": 0.3289715349674225, "learning_rate": 4.28909246084538e-05, "loss": 0.1453, "num_input_tokens_seen": 13737216, "step": 13640 }, { "epoch": 6.433286185761434, "grad_norm": 0.20694267749786377, "learning_rate": 4.288373860396102e-05, "loss": 0.1064, "num_input_tokens_seen": 13741888, "step": 13645 }, { "epoch": 6.435643564356436, "grad_norm": 0.25794366002082825, "learning_rate": 4.2876549572054945e-05, "loss": 0.2703, "num_input_tokens_seen": 13746944, "step": 13650 }, { "epoch": 6.438000942951438, "grad_norm": 0.551817774772644, "learning_rate": 4.286935751395256e-05, "loss": 0.1859, "num_input_tokens_seen": 13752256, "step": 13655 }, { "epoch": 6.44035832154644, "grad_norm": 0.13986842334270477, "learning_rate": 4.286216243087134e-05, "loss": 0.0688, "num_input_tokens_seen": 13757888, "step": 13660 }, { "epoch": 6.442715700141442, "grad_norm": 0.5792155265808105, "learning_rate": 4.2854964324029314e-05, "loss": 0.1229, "num_input_tokens_seen": 13762880, "step": 13665 }, { "epoch": 6.445073078736445, "grad_norm": 0.8995770812034607, "learning_rate": 4.284776319464498e-05, "loss": 0.197, "num_input_tokens_seen": 13768448, "step": 13670 }, { "epoch": 6.447430457331447, "grad_norm": 0.4884309470653534, "learning_rate": 4.284055904393738e-05, "loss": 0.1819, "num_input_tokens_seen": 13772448, "step": 13675 }, { "epoch": 6.4497878359264496, "grad_norm": 0.10699789226055145, "learning_rate": 4.283335187312604e-05, "loss": 0.0291, "num_input_tokens_seen": 13777632, "step": 13680 }, { "epoch": 6.452145214521452, "grad_norm": 0.24259445071220398, "learning_rate": 4.282614168343103e-05, "loss": 0.1056, "num_input_tokens_seen": 13783296, "step": 13685 }, { "epoch": 6.454502593116454, "grad_norm": 0.13562139868736267, "learning_rate": 4.2818928476072904e-05, "loss": 0.0513, "num_input_tokens_seen": 13788256, "step": 13690 }, { "epoch": 6.456859971711457, "grad_norm": 0.49085289239883423, "learning_rate": 4.2811712252272726e-05, "loss": 0.2004, "num_input_tokens_seen": 13794112, "step": 13695 }, { "epoch": 6.459217350306459, "grad_norm": 0.3391765058040619, "learning_rate": 4.28044930132521e-05, "loss": 0.1988, "num_input_tokens_seen": 13798880, "step": 13700 }, { "epoch": 6.461574728901462, "grad_norm": 0.5532790422439575, "learning_rate": 4.279727076023311e-05, "loss": 0.1004, "num_input_tokens_seen": 13802656, "step": 13705 }, { "epoch": 6.463932107496464, "grad_norm": 1.5010969638824463, "learning_rate": 4.279004549443836e-05, "loss": 0.2552, "num_input_tokens_seen": 13807328, "step": 13710 }, { "epoch": 6.466289486091466, "grad_norm": 2.044675350189209, "learning_rate": 4.278281721709096e-05, "loss": 0.1774, "num_input_tokens_seen": 13811872, "step": 13715 }, { "epoch": 6.468646864686469, "grad_norm": 0.19913195073604584, "learning_rate": 4.277558592941454e-05, "loss": 0.1233, "num_input_tokens_seen": 13817760, "step": 13720 }, { "epoch": 6.471004243281471, "grad_norm": 0.12107834219932556, "learning_rate": 4.276835163263324e-05, "loss": 0.1402, "num_input_tokens_seen": 13823648, "step": 13725 }, { "epoch": 6.473361621876474, "grad_norm": 0.10581047832965851, "learning_rate": 4.276111432797169e-05, "loss": 0.0521, "num_input_tokens_seen": 13829696, "step": 13730 }, { "epoch": 6.475719000471476, "grad_norm": 0.16905538737773895, "learning_rate": 4.275387401665506e-05, "loss": 0.1808, "num_input_tokens_seen": 13835200, "step": 13735 }, { "epoch": 6.478076379066478, "grad_norm": 0.2977181077003479, "learning_rate": 4.274663069990899e-05, "loss": 0.0446, "num_input_tokens_seen": 13839424, "step": 13740 }, { "epoch": 6.480433757661481, "grad_norm": 0.08616025745868683, "learning_rate": 4.273938437895968e-05, "loss": 0.1073, "num_input_tokens_seen": 13843840, "step": 13745 }, { "epoch": 6.482791136256483, "grad_norm": 0.060162827372550964, "learning_rate": 4.273213505503378e-05, "loss": 0.121, "num_input_tokens_seen": 13848608, "step": 13750 }, { "epoch": 6.485148514851485, "grad_norm": 0.09945767372846603, "learning_rate": 4.2724882729358494e-05, "loss": 0.2319, "num_input_tokens_seen": 13853856, "step": 13755 }, { "epoch": 6.487505893446487, "grad_norm": 1.0430008172988892, "learning_rate": 4.271762740316152e-05, "loss": 0.2215, "num_input_tokens_seen": 13858784, "step": 13760 }, { "epoch": 6.4898632720414895, "grad_norm": 0.28955671191215515, "learning_rate": 4.271036907767104e-05, "loss": 0.071, "num_input_tokens_seen": 13863808, "step": 13765 }, { "epoch": 6.492220650636492, "grad_norm": 0.2045263797044754, "learning_rate": 4.270310775411579e-05, "loss": 0.1346, "num_input_tokens_seen": 13869312, "step": 13770 }, { "epoch": 6.494578029231494, "grad_norm": 1.2509713172912598, "learning_rate": 4.269584343372498e-05, "loss": 0.2155, "num_input_tokens_seen": 13875136, "step": 13775 }, { "epoch": 6.496935407826497, "grad_norm": 1.152185082435608, "learning_rate": 4.268857611772833e-05, "loss": 0.1034, "num_input_tokens_seen": 13879680, "step": 13780 }, { "epoch": 6.499292786421499, "grad_norm": 0.1236402690410614, "learning_rate": 4.268130580735608e-05, "loss": 0.0526, "num_input_tokens_seen": 13884576, "step": 13785 }, { "epoch": 6.5016501650165015, "grad_norm": 0.2614073157310486, "learning_rate": 4.267403250383897e-05, "loss": 0.1368, "num_input_tokens_seen": 13889216, "step": 13790 }, { "epoch": 6.504007543611504, "grad_norm": 0.6037654280662537, "learning_rate": 4.2666756208408244e-05, "loss": 0.0429, "num_input_tokens_seen": 13894432, "step": 13795 }, { "epoch": 6.506364922206506, "grad_norm": 0.9054746031761169, "learning_rate": 4.265947692229565e-05, "loss": 0.1714, "num_input_tokens_seen": 13899936, "step": 13800 }, { "epoch": 6.508722300801509, "grad_norm": 1.230857253074646, "learning_rate": 4.265219464673346e-05, "loss": 0.0994, "num_input_tokens_seen": 13904960, "step": 13805 }, { "epoch": 6.511079679396511, "grad_norm": 0.3514794111251831, "learning_rate": 4.264490938295444e-05, "loss": 0.123, "num_input_tokens_seen": 13909728, "step": 13810 }, { "epoch": 6.5134370579915135, "grad_norm": 0.12496116757392883, "learning_rate": 4.263762113219184e-05, "loss": 0.1508, "num_input_tokens_seen": 13914080, "step": 13815 }, { "epoch": 6.515794436586516, "grad_norm": 0.7288587689399719, "learning_rate": 4.263032989567947e-05, "loss": 0.1122, "num_input_tokens_seen": 13918272, "step": 13820 }, { "epoch": 6.518151815181518, "grad_norm": 0.8553425073623657, "learning_rate": 4.262303567465158e-05, "loss": 0.0821, "num_input_tokens_seen": 13923360, "step": 13825 }, { "epoch": 6.520509193776521, "grad_norm": 0.3375836908817291, "learning_rate": 4.261573847034297e-05, "loss": 0.2089, "num_input_tokens_seen": 13928160, "step": 13830 }, { "epoch": 6.522866572371523, "grad_norm": 0.31308886408805847, "learning_rate": 4.260843828398895e-05, "loss": 0.2039, "num_input_tokens_seen": 13933472, "step": 13835 }, { "epoch": 6.5252239509665255, "grad_norm": 4.064558982849121, "learning_rate": 4.260113511682529e-05, "loss": 0.0785, "num_input_tokens_seen": 13938272, "step": 13840 }, { "epoch": 6.527581329561528, "grad_norm": 0.38407185673713684, "learning_rate": 4.259382897008831e-05, "loss": 0.0958, "num_input_tokens_seen": 13943584, "step": 13845 }, { "epoch": 6.52993870815653, "grad_norm": 0.09535133093595505, "learning_rate": 4.2586519845014815e-05, "loss": 0.0778, "num_input_tokens_seen": 13948992, "step": 13850 }, { "epoch": 6.532296086751533, "grad_norm": 1.0956743955612183, "learning_rate": 4.257920774284211e-05, "loss": 0.1301, "num_input_tokens_seen": 13953600, "step": 13855 }, { "epoch": 6.534653465346535, "grad_norm": 0.13947159051895142, "learning_rate": 4.257189266480801e-05, "loss": 0.1164, "num_input_tokens_seen": 13958400, "step": 13860 }, { "epoch": 6.537010843941537, "grad_norm": 0.08856464922428131, "learning_rate": 4.256457461215084e-05, "loss": 0.1852, "num_input_tokens_seen": 13964608, "step": 13865 }, { "epoch": 6.539368222536539, "grad_norm": 0.29342058300971985, "learning_rate": 4.255725358610941e-05, "loss": 0.0602, "num_input_tokens_seen": 13968896, "step": 13870 }, { "epoch": 6.5417256011315414, "grad_norm": 0.2700929343700409, "learning_rate": 4.254992958792306e-05, "loss": 0.1396, "num_input_tokens_seen": 13974272, "step": 13875 }, { "epoch": 6.544082979726544, "grad_norm": 1.9777685403823853, "learning_rate": 4.2542602618831606e-05, "loss": 0.1315, "num_input_tokens_seen": 13979424, "step": 13880 }, { "epoch": 6.546440358321546, "grad_norm": 0.14343731105327606, "learning_rate": 4.253527268007539e-05, "loss": 0.2209, "num_input_tokens_seen": 13983584, "step": 13885 }, { "epoch": 6.548797736916549, "grad_norm": 1.539917230606079, "learning_rate": 4.252793977289524e-05, "loss": 0.0883, "num_input_tokens_seen": 13988288, "step": 13890 }, { "epoch": 6.551155115511551, "grad_norm": 2.1153807640075684, "learning_rate": 4.25206038985325e-05, "loss": 0.2017, "num_input_tokens_seen": 13992800, "step": 13895 }, { "epoch": 6.5535124941065535, "grad_norm": 0.08696555346250534, "learning_rate": 4.251326505822899e-05, "loss": 0.0683, "num_input_tokens_seen": 13999936, "step": 13900 }, { "epoch": 6.555869872701556, "grad_norm": 0.15792420506477356, "learning_rate": 4.250592325322707e-05, "loss": 0.1017, "num_input_tokens_seen": 14003968, "step": 13905 }, { "epoch": 6.558227251296558, "grad_norm": 0.0860198438167572, "learning_rate": 4.249857848476958e-05, "loss": 0.2428, "num_input_tokens_seen": 14009120, "step": 13910 }, { "epoch": 6.560584629891561, "grad_norm": 0.019164463505148888, "learning_rate": 4.249123075409985e-05, "loss": 0.2009, "num_input_tokens_seen": 14014080, "step": 13915 }, { "epoch": 6.562942008486563, "grad_norm": 0.9407402276992798, "learning_rate": 4.248388006246175e-05, "loss": 0.1001, "num_input_tokens_seen": 14020128, "step": 13920 }, { "epoch": 6.5652993870815655, "grad_norm": 1.6037836074829102, "learning_rate": 4.247652641109962e-05, "loss": 0.2345, "num_input_tokens_seen": 14024800, "step": 13925 }, { "epoch": 6.567656765676568, "grad_norm": 0.4290032982826233, "learning_rate": 4.246916980125829e-05, "loss": 0.1527, "num_input_tokens_seen": 14030496, "step": 13930 }, { "epoch": 6.57001414427157, "grad_norm": 0.8107871413230896, "learning_rate": 4.246181023418312e-05, "loss": 0.0436, "num_input_tokens_seen": 14035872, "step": 13935 }, { "epoch": 6.572371522866573, "grad_norm": 0.9150046706199646, "learning_rate": 4.245444771111997e-05, "loss": 0.1761, "num_input_tokens_seen": 14040416, "step": 13940 }, { "epoch": 6.574728901461575, "grad_norm": 0.13870880007743835, "learning_rate": 4.2447082233315184e-05, "loss": 0.0328, "num_input_tokens_seen": 14045760, "step": 13945 }, { "epoch": 6.5770862800565775, "grad_norm": 0.8835760951042175, "learning_rate": 4.24397138020156e-05, "loss": 0.0715, "num_input_tokens_seen": 14051392, "step": 13950 }, { "epoch": 6.579443658651579, "grad_norm": 0.35191819071769714, "learning_rate": 4.243234241846858e-05, "loss": 0.0628, "num_input_tokens_seen": 14055872, "step": 13955 }, { "epoch": 6.581801037246581, "grad_norm": 1.9179325103759766, "learning_rate": 4.242496808392199e-05, "loss": 0.2508, "num_input_tokens_seen": 14060608, "step": 13960 }, { "epoch": 6.584158415841584, "grad_norm": 1.3513000011444092, "learning_rate": 4.241759079962415e-05, "loss": 0.1753, "num_input_tokens_seen": 14065600, "step": 13965 }, { "epoch": 6.586515794436586, "grad_norm": 0.5274377465248108, "learning_rate": 4.241021056682393e-05, "loss": 0.0568, "num_input_tokens_seen": 14071008, "step": 13970 }, { "epoch": 6.588873173031589, "grad_norm": 0.2823733687400818, "learning_rate": 4.240282738677066e-05, "loss": 0.1101, "num_input_tokens_seen": 14075648, "step": 13975 }, { "epoch": 6.591230551626591, "grad_norm": 1.4056997299194336, "learning_rate": 4.2395441260714196e-05, "loss": 0.1578, "num_input_tokens_seen": 14080960, "step": 13980 }, { "epoch": 6.593587930221593, "grad_norm": 0.9327796101570129, "learning_rate": 4.2388052189904885e-05, "loss": 0.1027, "num_input_tokens_seen": 14085632, "step": 13985 }, { "epoch": 6.595945308816596, "grad_norm": 2.045250654220581, "learning_rate": 4.238066017559357e-05, "loss": 0.1665, "num_input_tokens_seen": 14090400, "step": 13990 }, { "epoch": 6.598302687411598, "grad_norm": 0.3597540557384491, "learning_rate": 4.23732652190316e-05, "loss": 0.1362, "num_input_tokens_seen": 14095072, "step": 13995 }, { "epoch": 6.600660066006601, "grad_norm": 1.1678272485733032, "learning_rate": 4.2365867321470806e-05, "loss": 0.1974, "num_input_tokens_seen": 14099616, "step": 14000 }, { "epoch": 6.603017444601603, "grad_norm": 2.00354266166687, "learning_rate": 4.235846648416353e-05, "loss": 0.0964, "num_input_tokens_seen": 14104352, "step": 14005 }, { "epoch": 6.605374823196605, "grad_norm": 0.30545735359191895, "learning_rate": 4.235106270836261e-05, "loss": 0.0644, "num_input_tokens_seen": 14109504, "step": 14010 }, { "epoch": 6.607732201791608, "grad_norm": 0.49373921751976013, "learning_rate": 4.2343655995321375e-05, "loss": 0.1716, "num_input_tokens_seen": 14114592, "step": 14015 }, { "epoch": 6.61008958038661, "grad_norm": 0.079251229763031, "learning_rate": 4.233624634629366e-05, "loss": 0.1485, "num_input_tokens_seen": 14118592, "step": 14020 }, { "epoch": 6.612446958981613, "grad_norm": 1.2263073921203613, "learning_rate": 4.232883376253379e-05, "loss": 0.2016, "num_input_tokens_seen": 14122048, "step": 14025 }, { "epoch": 6.614804337576615, "grad_norm": 0.04815242439508438, "learning_rate": 4.2321418245296595e-05, "loss": 0.0894, "num_input_tokens_seen": 14128704, "step": 14030 }, { "epoch": 6.617161716171617, "grad_norm": 1.0206711292266846, "learning_rate": 4.231399979583739e-05, "loss": 0.2617, "num_input_tokens_seen": 14134432, "step": 14035 }, { "epoch": 6.61951909476662, "grad_norm": 0.2604161500930786, "learning_rate": 4.2306578415411994e-05, "loss": 0.2404, "num_input_tokens_seen": 14141984, "step": 14040 }, { "epoch": 6.621876473361622, "grad_norm": 0.24392014741897583, "learning_rate": 4.229915410527672e-05, "loss": 0.2245, "num_input_tokens_seen": 14147456, "step": 14045 }, { "epoch": 6.624233851956625, "grad_norm": 0.5485674738883972, "learning_rate": 4.229172686668837e-05, "loss": 0.0572, "num_input_tokens_seen": 14152640, "step": 14050 }, { "epoch": 6.626591230551627, "grad_norm": 0.9348993301391602, "learning_rate": 4.228429670090428e-05, "loss": 0.108, "num_input_tokens_seen": 14156288, "step": 14055 }, { "epoch": 6.628948609146629, "grad_norm": 0.6045224070549011, "learning_rate": 4.227686360918221e-05, "loss": 0.0878, "num_input_tokens_seen": 14162048, "step": 14060 }, { "epoch": 6.631305987741631, "grad_norm": 0.47647130489349365, "learning_rate": 4.226942759278048e-05, "loss": 0.0856, "num_input_tokens_seen": 14166432, "step": 14065 }, { "epoch": 6.633663366336633, "grad_norm": 0.07722649723291397, "learning_rate": 4.226198865295789e-05, "loss": 0.0458, "num_input_tokens_seen": 14171040, "step": 14070 }, { "epoch": 6.636020744931636, "grad_norm": 0.409922331571579, "learning_rate": 4.2254546790973704e-05, "loss": 0.147, "num_input_tokens_seen": 14176224, "step": 14075 }, { "epoch": 6.638378123526638, "grad_norm": 0.4543857276439667, "learning_rate": 4.224710200808771e-05, "loss": 0.0807, "num_input_tokens_seen": 14181280, "step": 14080 }, { "epoch": 6.6407355021216405, "grad_norm": 1.5703860521316528, "learning_rate": 4.2239654305560186e-05, "loss": 0.1424, "num_input_tokens_seen": 14185376, "step": 14085 }, { "epoch": 6.643092880716643, "grad_norm": 1.2951496839523315, "learning_rate": 4.223220368465189e-05, "loss": 0.0776, "num_input_tokens_seen": 14189600, "step": 14090 }, { "epoch": 6.645450259311645, "grad_norm": 1.4600259065628052, "learning_rate": 4.222475014662411e-05, "loss": 0.1573, "num_input_tokens_seen": 14194656, "step": 14095 }, { "epoch": 6.647807637906648, "grad_norm": 0.0831858366727829, "learning_rate": 4.221729369273859e-05, "loss": 0.087, "num_input_tokens_seen": 14200128, "step": 14100 }, { "epoch": 6.65016501650165, "grad_norm": 1.5892388820648193, "learning_rate": 4.220983432425758e-05, "loss": 0.1868, "num_input_tokens_seen": 14205024, "step": 14105 }, { "epoch": 6.6525223950966526, "grad_norm": 1.0511566400527954, "learning_rate": 4.2202372042443816e-05, "loss": 0.0569, "num_input_tokens_seen": 14209088, "step": 14110 }, { "epoch": 6.654879773691655, "grad_norm": 1.4873170852661133, "learning_rate": 4.2194906848560555e-05, "loss": 0.1584, "num_input_tokens_seen": 14213600, "step": 14115 }, { "epoch": 6.657237152286657, "grad_norm": 1.1260432004928589, "learning_rate": 4.218743874387151e-05, "loss": 0.2236, "num_input_tokens_seen": 14218944, "step": 14120 }, { "epoch": 6.65959453088166, "grad_norm": 0.3263751268386841, "learning_rate": 4.217996772964092e-05, "loss": 0.1766, "num_input_tokens_seen": 14223552, "step": 14125 }, { "epoch": 6.661951909476662, "grad_norm": 0.5947125554084778, "learning_rate": 4.2172493807133485e-05, "loss": 0.0607, "num_input_tokens_seen": 14229152, "step": 14130 }, { "epoch": 6.664309288071665, "grad_norm": 0.4898747205734253, "learning_rate": 4.2165016977614424e-05, "loss": 0.047, "num_input_tokens_seen": 14234592, "step": 14135 }, { "epoch": 6.666666666666667, "grad_norm": 1.1919913291931152, "learning_rate": 4.2157537242349433e-05, "loss": 0.1701, "num_input_tokens_seen": 14240416, "step": 14140 }, { "epoch": 6.669024045261669, "grad_norm": 1.0290892124176025, "learning_rate": 4.2150054602604706e-05, "loss": 0.1845, "num_input_tokens_seen": 14244608, "step": 14145 }, { "epoch": 6.671381423856672, "grad_norm": 1.607894778251648, "learning_rate": 4.214256905964693e-05, "loss": 0.2033, "num_input_tokens_seen": 14249888, "step": 14150 }, { "epoch": 6.673738802451673, "grad_norm": 0.3663918375968933, "learning_rate": 4.213508061474327e-05, "loss": 0.0196, "num_input_tokens_seen": 14254976, "step": 14155 }, { "epoch": 6.676096181046676, "grad_norm": 0.8879036903381348, "learning_rate": 4.2127589269161405e-05, "loss": 0.1411, "num_input_tokens_seen": 14259264, "step": 14160 }, { "epoch": 6.678453559641678, "grad_norm": 0.8115882873535156, "learning_rate": 4.212009502416948e-05, "loss": 0.1729, "num_input_tokens_seen": 14263872, "step": 14165 }, { "epoch": 6.6808109382366805, "grad_norm": 1.2638782262802124, "learning_rate": 4.2112597881036156e-05, "loss": 0.0781, "num_input_tokens_seen": 14269728, "step": 14170 }, { "epoch": 6.683168316831683, "grad_norm": 0.5950202941894531, "learning_rate": 4.210509784103056e-05, "loss": 0.0431, "num_input_tokens_seen": 14274336, "step": 14175 }, { "epoch": 6.685525695426685, "grad_norm": 0.6455886960029602, "learning_rate": 4.2097594905422335e-05, "loss": 0.0875, "num_input_tokens_seen": 14279136, "step": 14180 }, { "epoch": 6.687883074021688, "grad_norm": 0.17079028487205505, "learning_rate": 4.209008907548159e-05, "loss": 0.1746, "num_input_tokens_seen": 14283680, "step": 14185 }, { "epoch": 6.69024045261669, "grad_norm": 0.5507495403289795, "learning_rate": 4.208258035247894e-05, "loss": 0.1463, "num_input_tokens_seen": 14289216, "step": 14190 }, { "epoch": 6.6925978312116925, "grad_norm": 0.20729216933250427, "learning_rate": 4.207506873768547e-05, "loss": 0.1758, "num_input_tokens_seen": 14293984, "step": 14195 }, { "epoch": 6.694955209806695, "grad_norm": 0.7077010273933411, "learning_rate": 4.2067554232372794e-05, "loss": 0.0883, "num_input_tokens_seen": 14298624, "step": 14200 }, { "epoch": 6.697312588401697, "grad_norm": 1.1083916425704956, "learning_rate": 4.206003683781298e-05, "loss": 0.1047, "num_input_tokens_seen": 14304032, "step": 14205 }, { "epoch": 6.6996699669967, "grad_norm": 2.2854506969451904, "learning_rate": 4.2052516555278576e-05, "loss": 0.2166, "num_input_tokens_seen": 14308064, "step": 14210 }, { "epoch": 6.702027345591702, "grad_norm": 1.1393765211105347, "learning_rate": 4.204499338604267e-05, "loss": 0.1337, "num_input_tokens_seen": 14312064, "step": 14215 }, { "epoch": 6.7043847241867045, "grad_norm": 0.16637755930423737, "learning_rate": 4.203746733137878e-05, "loss": 0.192, "num_input_tokens_seen": 14316928, "step": 14220 }, { "epoch": 6.706742102781707, "grad_norm": 0.8671988248825073, "learning_rate": 4.2029938392560956e-05, "loss": 0.1264, "num_input_tokens_seen": 14322336, "step": 14225 }, { "epoch": 6.709099481376709, "grad_norm": 0.14540283381938934, "learning_rate": 4.202240657086371e-05, "loss": 0.1725, "num_input_tokens_seen": 14327968, "step": 14230 }, { "epoch": 6.711456859971712, "grad_norm": 1.3432871103286743, "learning_rate": 4.201487186756207e-05, "loss": 0.3851, "num_input_tokens_seen": 14332992, "step": 14235 }, { "epoch": 6.713814238566714, "grad_norm": 1.3101028203964233, "learning_rate": 4.20073342839315e-05, "loss": 0.2909, "num_input_tokens_seen": 14337792, "step": 14240 }, { "epoch": 6.7161716171617165, "grad_norm": 0.27777570486068726, "learning_rate": 4.1999793821248016e-05, "loss": 0.2265, "num_input_tokens_seen": 14342720, "step": 14245 }, { "epoch": 6.718528995756719, "grad_norm": 0.182470440864563, "learning_rate": 4.1992250480788075e-05, "loss": 0.1139, "num_input_tokens_seen": 14348000, "step": 14250 }, { "epoch": 6.720886374351721, "grad_norm": 0.15660248696804047, "learning_rate": 4.1984704263828634e-05, "loss": 0.1385, "num_input_tokens_seen": 14355072, "step": 14255 }, { "epoch": 6.723243752946724, "grad_norm": 0.38533636927604675, "learning_rate": 4.197715517164715e-05, "loss": 0.1054, "num_input_tokens_seen": 14360512, "step": 14260 }, { "epoch": 6.725601131541725, "grad_norm": 1.9634782075881958, "learning_rate": 4.1969603205521554e-05, "loss": 0.1598, "num_input_tokens_seen": 14364864, "step": 14265 }, { "epoch": 6.727958510136728, "grad_norm": 1.36394202709198, "learning_rate": 4.1962048366730254e-05, "loss": 0.4257, "num_input_tokens_seen": 14369312, "step": 14270 }, { "epoch": 6.73031588873173, "grad_norm": 0.7670515775680542, "learning_rate": 4.195449065655217e-05, "loss": 0.1595, "num_input_tokens_seen": 14373920, "step": 14275 }, { "epoch": 6.732673267326732, "grad_norm": 1.2369240522384644, "learning_rate": 4.194693007626669e-05, "loss": 0.2387, "num_input_tokens_seen": 14379072, "step": 14280 }, { "epoch": 6.735030645921735, "grad_norm": 0.5518625974655151, "learning_rate": 4.193936662715369e-05, "loss": 0.1191, "num_input_tokens_seen": 14383520, "step": 14285 }, { "epoch": 6.737388024516737, "grad_norm": 1.96601140499115, "learning_rate": 4.193180031049354e-05, "loss": 0.1049, "num_input_tokens_seen": 14388160, "step": 14290 }, { "epoch": 6.73974540311174, "grad_norm": 0.5002797842025757, "learning_rate": 4.192423112756708e-05, "loss": 0.1387, "num_input_tokens_seen": 14392960, "step": 14295 }, { "epoch": 6.742102781706742, "grad_norm": 0.965576708316803, "learning_rate": 4.191665907965564e-05, "loss": 0.1302, "num_input_tokens_seen": 14400096, "step": 14300 }, { "epoch": 6.7444601603017444, "grad_norm": 0.24429574608802795, "learning_rate": 4.190908416804105e-05, "loss": 0.212, "num_input_tokens_seen": 14404096, "step": 14305 }, { "epoch": 6.746817538896747, "grad_norm": 0.32135286927223206, "learning_rate": 4.190150639400561e-05, "loss": 0.0586, "num_input_tokens_seen": 14409184, "step": 14310 }, { "epoch": 6.749174917491749, "grad_norm": 0.0994291752576828, "learning_rate": 4.189392575883211e-05, "loss": 0.2466, "num_input_tokens_seen": 14414112, "step": 14315 }, { "epoch": 6.751532296086752, "grad_norm": 0.09578340500593185, "learning_rate": 4.188634226380382e-05, "loss": 0.048, "num_input_tokens_seen": 14418976, "step": 14320 }, { "epoch": 6.753889674681754, "grad_norm": 1.2016175985336304, "learning_rate": 4.1878755910204495e-05, "loss": 0.0671, "num_input_tokens_seen": 14424992, "step": 14325 }, { "epoch": 6.7562470532767565, "grad_norm": 0.2967555820941925, "learning_rate": 4.1871166699318384e-05, "loss": 0.1964, "num_input_tokens_seen": 14431680, "step": 14330 }, { "epoch": 6.758604431871759, "grad_norm": 0.8896477222442627, "learning_rate": 4.186357463243021e-05, "loss": 0.1711, "num_input_tokens_seen": 14435776, "step": 14335 }, { "epoch": 6.760961810466761, "grad_norm": 0.29425084590911865, "learning_rate": 4.1855979710825173e-05, "loss": 0.0789, "num_input_tokens_seen": 14440704, "step": 14340 }, { "epoch": 6.763319189061764, "grad_norm": 2.180748462677002, "learning_rate": 4.1848381935788975e-05, "loss": 0.1469, "num_input_tokens_seen": 14445792, "step": 14345 }, { "epoch": 6.765676567656766, "grad_norm": 0.2480921745300293, "learning_rate": 4.1840781308607776e-05, "loss": 0.1052, "num_input_tokens_seen": 14450144, "step": 14350 }, { "epoch": 6.768033946251768, "grad_norm": 0.7137916684150696, "learning_rate": 4.183317783056825e-05, "loss": 0.0849, "num_input_tokens_seen": 14454592, "step": 14355 }, { "epoch": 6.77039132484677, "grad_norm": 0.8686250448226929, "learning_rate": 4.1825571502957514e-05, "loss": 0.2981, "num_input_tokens_seen": 14459360, "step": 14360 }, { "epoch": 6.772748703441772, "grad_norm": 1.3512685298919678, "learning_rate": 4.181796232706322e-05, "loss": 0.3288, "num_input_tokens_seen": 14463840, "step": 14365 }, { "epoch": 6.775106082036775, "grad_norm": 0.1982029676437378, "learning_rate": 4.181035030417345e-05, "loss": 0.1073, "num_input_tokens_seen": 14469280, "step": 14370 }, { "epoch": 6.777463460631777, "grad_norm": 1.3202447891235352, "learning_rate": 4.18027354355768e-05, "loss": 0.1962, "num_input_tokens_seen": 14473408, "step": 14375 }, { "epoch": 6.77982083922678, "grad_norm": 0.7614594101905823, "learning_rate": 4.179511772256233e-05, "loss": 0.0829, "num_input_tokens_seen": 14478144, "step": 14380 }, { "epoch": 6.782178217821782, "grad_norm": 0.5014420747756958, "learning_rate": 4.17874971664196e-05, "loss": 0.1967, "num_input_tokens_seen": 14483008, "step": 14385 }, { "epoch": 6.784535596416784, "grad_norm": 0.2387993186712265, "learning_rate": 4.177987376843863e-05, "loss": 0.1215, "num_input_tokens_seen": 14488032, "step": 14390 }, { "epoch": 6.786892975011787, "grad_norm": 0.7924051284790039, "learning_rate": 4.1772247529909946e-05, "loss": 0.1772, "num_input_tokens_seen": 14493856, "step": 14395 }, { "epoch": 6.789250353606789, "grad_norm": 0.3076326549053192, "learning_rate": 4.176461845212452e-05, "loss": 0.1364, "num_input_tokens_seen": 14498528, "step": 14400 }, { "epoch": 6.791607732201792, "grad_norm": 1.5635855197906494, "learning_rate": 4.1756986536373844e-05, "loss": 0.1775, "num_input_tokens_seen": 14503680, "step": 14405 }, { "epoch": 6.793965110796794, "grad_norm": 2.9571685791015625, "learning_rate": 4.174935178394987e-05, "loss": 0.4332, "num_input_tokens_seen": 14509440, "step": 14410 }, { "epoch": 6.796322489391796, "grad_norm": 0.21571074426174164, "learning_rate": 4.174171419614502e-05, "loss": 0.1141, "num_input_tokens_seen": 14514112, "step": 14415 }, { "epoch": 6.798679867986799, "grad_norm": 1.1330314874649048, "learning_rate": 4.1734073774252214e-05, "loss": 0.0925, "num_input_tokens_seen": 14519904, "step": 14420 }, { "epoch": 6.801037246581801, "grad_norm": 1.3675533533096313, "learning_rate": 4.172643051956485e-05, "loss": 0.1781, "num_input_tokens_seen": 14524576, "step": 14425 }, { "epoch": 6.803394625176804, "grad_norm": 0.40346965193748474, "learning_rate": 4.171878443337679e-05, "loss": 0.0966, "num_input_tokens_seen": 14530528, "step": 14430 }, { "epoch": 6.805752003771806, "grad_norm": 1.0774134397506714, "learning_rate": 4.17111355169824e-05, "loss": 0.0956, "num_input_tokens_seen": 14535552, "step": 14435 }, { "epoch": 6.808109382366808, "grad_norm": 0.0735764279961586, "learning_rate": 4.170348377167651e-05, "loss": 0.1293, "num_input_tokens_seen": 14539392, "step": 14440 }, { "epoch": 6.810466760961811, "grad_norm": 0.6533068418502808, "learning_rate": 4.169582919875442e-05, "loss": 0.1533, "num_input_tokens_seen": 14545536, "step": 14445 }, { "epoch": 6.812824139556813, "grad_norm": 0.6389421820640564, "learning_rate": 4.1688171799511925e-05, "loss": 0.0859, "num_input_tokens_seen": 14549504, "step": 14450 }, { "epoch": 6.815181518151816, "grad_norm": 0.17812123894691467, "learning_rate": 4.1680511575245294e-05, "loss": 0.2385, "num_input_tokens_seen": 14555136, "step": 14455 }, { "epoch": 6.817538896746818, "grad_norm": 0.9302536845207214, "learning_rate": 4.1672848527251274e-05, "loss": 0.2515, "num_input_tokens_seen": 14559968, "step": 14460 }, { "epoch": 6.8198962753418195, "grad_norm": 0.19966436922550201, "learning_rate": 4.166518265682709e-05, "loss": 0.2692, "num_input_tokens_seen": 14564320, "step": 14465 }, { "epoch": 6.822253653936822, "grad_norm": 1.2019678354263306, "learning_rate": 4.165751396527044e-05, "loss": 0.2262, "num_input_tokens_seen": 14569376, "step": 14470 }, { "epoch": 6.824611032531824, "grad_norm": 0.27529144287109375, "learning_rate": 4.16498424538795e-05, "loss": 0.0457, "num_input_tokens_seen": 14575328, "step": 14475 }, { "epoch": 6.826968411126827, "grad_norm": 0.8462657928466797, "learning_rate": 4.1642168123952936e-05, "loss": 0.2331, "num_input_tokens_seen": 14580960, "step": 14480 }, { "epoch": 6.829325789721829, "grad_norm": 0.12885448336601257, "learning_rate": 4.163449097678987e-05, "loss": 0.1617, "num_input_tokens_seen": 14587776, "step": 14485 }, { "epoch": 6.8316831683168315, "grad_norm": 1.3941631317138672, "learning_rate": 4.1626811013689925e-05, "loss": 0.1394, "num_input_tokens_seen": 14592992, "step": 14490 }, { "epoch": 6.834040546911834, "grad_norm": 0.10822591930627823, "learning_rate": 4.1619128235953177e-05, "loss": 0.0679, "num_input_tokens_seen": 14597536, "step": 14495 }, { "epoch": 6.836397925506836, "grad_norm": 1.67603600025177, "learning_rate": 4.161144264488021e-05, "loss": 0.1465, "num_input_tokens_seen": 14602400, "step": 14500 }, { "epoch": 6.838755304101839, "grad_norm": 0.3291435241699219, "learning_rate": 4.160375424177203e-05, "loss": 0.0342, "num_input_tokens_seen": 14606112, "step": 14505 }, { "epoch": 6.841112682696841, "grad_norm": 0.15515835583209991, "learning_rate": 4.1596063027930185e-05, "loss": 0.1328, "num_input_tokens_seen": 14610400, "step": 14510 }, { "epoch": 6.8434700612918435, "grad_norm": 1.2089608907699585, "learning_rate": 4.158836900465665e-05, "loss": 0.207, "num_input_tokens_seen": 14615712, "step": 14515 }, { "epoch": 6.845827439886846, "grad_norm": 2.192171812057495, "learning_rate": 4.15806721732539e-05, "loss": 0.2916, "num_input_tokens_seen": 14620768, "step": 14520 }, { "epoch": 6.848184818481848, "grad_norm": 0.48559296131134033, "learning_rate": 4.1572972535024865e-05, "loss": 0.1078, "num_input_tokens_seen": 14625408, "step": 14525 }, { "epoch": 6.850542197076851, "grad_norm": 1.0310664176940918, "learning_rate": 4.156527009127298e-05, "loss": 0.4286, "num_input_tokens_seen": 14630528, "step": 14530 }, { "epoch": 6.852899575671853, "grad_norm": 0.3352069556713104, "learning_rate": 4.155756484330213e-05, "loss": 0.076, "num_input_tokens_seen": 14635968, "step": 14535 }, { "epoch": 6.8552569542668556, "grad_norm": 0.21386320888996124, "learning_rate": 4.154985679241668e-05, "loss": 0.3006, "num_input_tokens_seen": 14642272, "step": 14540 }, { "epoch": 6.857614332861858, "grad_norm": 0.19055728614330292, "learning_rate": 4.154214593992149e-05, "loss": 0.0456, "num_input_tokens_seen": 14646976, "step": 14545 }, { "epoch": 6.85997171145686, "grad_norm": 0.1865151822566986, "learning_rate": 4.153443228712185e-05, "loss": 0.0449, "num_input_tokens_seen": 14651488, "step": 14550 }, { "epoch": 6.862329090051862, "grad_norm": 0.17798642814159393, "learning_rate": 4.152671583532357e-05, "loss": 0.0988, "num_input_tokens_seen": 14656352, "step": 14555 }, { "epoch": 6.864686468646864, "grad_norm": 0.061655063182115555, "learning_rate": 4.151899658583289e-05, "loss": 0.1021, "num_input_tokens_seen": 14661536, "step": 14560 }, { "epoch": 6.867043847241867, "grad_norm": 1.098888635635376, "learning_rate": 4.151127453995658e-05, "loss": 0.2142, "num_input_tokens_seen": 14666592, "step": 14565 }, { "epoch": 6.869401225836869, "grad_norm": 0.10582344233989716, "learning_rate": 4.150354969900183e-05, "loss": 0.1348, "num_input_tokens_seen": 14671456, "step": 14570 }, { "epoch": 6.8717586044318715, "grad_norm": 1.0337018966674805, "learning_rate": 4.149582206427633e-05, "loss": 0.234, "num_input_tokens_seen": 14677568, "step": 14575 }, { "epoch": 6.874115983026874, "grad_norm": 1.3021234273910522, "learning_rate": 4.1488091637088244e-05, "loss": 0.0632, "num_input_tokens_seen": 14682400, "step": 14580 }, { "epoch": 6.876473361621876, "grad_norm": 0.36071598529815674, "learning_rate": 4.148035841874619e-05, "loss": 0.1263, "num_input_tokens_seen": 14688672, "step": 14585 }, { "epoch": 6.878830740216879, "grad_norm": 0.42824873328208923, "learning_rate": 4.147262241055928e-05, "loss": 0.1827, "num_input_tokens_seen": 14693312, "step": 14590 }, { "epoch": 6.881188118811881, "grad_norm": 0.17130792140960693, "learning_rate": 4.146488361383708e-05, "loss": 0.0727, "num_input_tokens_seen": 14698624, "step": 14595 }, { "epoch": 6.8835454974068835, "grad_norm": 0.2894497513771057, "learning_rate": 4.145714202988965e-05, "loss": 0.215, "num_input_tokens_seen": 14703584, "step": 14600 }, { "epoch": 6.885902876001886, "grad_norm": 0.4445306360721588, "learning_rate": 4.1449397660027495e-05, "loss": 0.1351, "num_input_tokens_seen": 14707424, "step": 14605 }, { "epoch": 6.888260254596888, "grad_norm": 0.8292264342308044, "learning_rate": 4.144165050556161e-05, "loss": 0.0671, "num_input_tokens_seen": 14712224, "step": 14610 }, { "epoch": 6.890617633191891, "grad_norm": 0.14510804414749146, "learning_rate": 4.1433900567803464e-05, "loss": 0.0758, "num_input_tokens_seen": 14717184, "step": 14615 }, { "epoch": 6.892975011786893, "grad_norm": 0.3049812316894531, "learning_rate": 4.1426147848064974e-05, "loss": 0.0915, "num_input_tokens_seen": 14721824, "step": 14620 }, { "epoch": 6.8953323903818955, "grad_norm": 0.9612558484077454, "learning_rate": 4.1418392347658555e-05, "loss": 0.3243, "num_input_tokens_seen": 14726464, "step": 14625 }, { "epoch": 6.897689768976898, "grad_norm": 0.968523383140564, "learning_rate": 4.141063406789708e-05, "loss": 0.3701, "num_input_tokens_seen": 14731712, "step": 14630 }, { "epoch": 6.9000471475719, "grad_norm": 0.21588356792926788, "learning_rate": 4.1402873010093904e-05, "loss": 0.1976, "num_input_tokens_seen": 14736416, "step": 14635 }, { "epoch": 6.902404526166903, "grad_norm": 0.6311640739440918, "learning_rate": 4.139510917556283e-05, "loss": 0.1535, "num_input_tokens_seen": 14741888, "step": 14640 }, { "epoch": 6.904761904761905, "grad_norm": 0.05591054633259773, "learning_rate": 4.1387342565618134e-05, "loss": 0.1272, "num_input_tokens_seen": 14747488, "step": 14645 }, { "epoch": 6.9071192833569075, "grad_norm": 1.783203363418579, "learning_rate": 4.1379573181574596e-05, "loss": 0.2388, "num_input_tokens_seen": 14751680, "step": 14650 }, { "epoch": 6.90947666195191, "grad_norm": 1.0429673194885254, "learning_rate": 4.137180102474742e-05, "loss": 0.0622, "num_input_tokens_seen": 14756320, "step": 14655 }, { "epoch": 6.911834040546912, "grad_norm": 0.4944562017917633, "learning_rate": 4.1364026096452316e-05, "loss": 0.1033, "num_input_tokens_seen": 14760544, "step": 14660 }, { "epoch": 6.914191419141914, "grad_norm": 0.45502257347106934, "learning_rate": 4.135624839800543e-05, "loss": 0.1532, "num_input_tokens_seen": 14765856, "step": 14665 }, { "epoch": 6.916548797736916, "grad_norm": 0.04968572035431862, "learning_rate": 4.13484679307234e-05, "loss": 0.0501, "num_input_tokens_seen": 14770048, "step": 14670 }, { "epoch": 6.918906176331919, "grad_norm": 0.3815339207649231, "learning_rate": 4.134068469592334e-05, "loss": 0.3648, "num_input_tokens_seen": 14774848, "step": 14675 }, { "epoch": 6.921263554926921, "grad_norm": 0.1272626668214798, "learning_rate": 4.1332898694922804e-05, "loss": 0.1889, "num_input_tokens_seen": 14779584, "step": 14680 }, { "epoch": 6.923620933521923, "grad_norm": 0.894743025302887, "learning_rate": 4.132510992903984e-05, "loss": 0.1167, "num_input_tokens_seen": 14783616, "step": 14685 }, { "epoch": 6.925978312116926, "grad_norm": 0.3647109568119049, "learning_rate": 4.131731839959295e-05, "loss": 0.0679, "num_input_tokens_seen": 14788928, "step": 14690 }, { "epoch": 6.928335690711928, "grad_norm": 1.0289851427078247, "learning_rate": 4.130952410790111e-05, "loss": 0.0915, "num_input_tokens_seen": 14793792, "step": 14695 }, { "epoch": 6.930693069306931, "grad_norm": 0.21021921932697296, "learning_rate": 4.130172705528375e-05, "loss": 0.1498, "num_input_tokens_seen": 14799200, "step": 14700 }, { "epoch": 6.933050447901933, "grad_norm": 0.5083552598953247, "learning_rate": 4.1293927243060795e-05, "loss": 0.0888, "num_input_tokens_seen": 14804896, "step": 14705 }, { "epoch": 6.935407826496935, "grad_norm": 0.1462305635213852, "learning_rate": 4.128612467255261e-05, "loss": 0.1529, "num_input_tokens_seen": 14808864, "step": 14710 }, { "epoch": 6.937765205091938, "grad_norm": 0.6995662450790405, "learning_rate": 4.1278319345080036e-05, "loss": 0.0882, "num_input_tokens_seen": 14814688, "step": 14715 }, { "epoch": 6.94012258368694, "grad_norm": 0.374756783246994, "learning_rate": 4.127051126196439e-05, "loss": 0.162, "num_input_tokens_seen": 14819008, "step": 14720 }, { "epoch": 6.942479962281943, "grad_norm": 0.08819115906953812, "learning_rate": 4.1262700424527445e-05, "loss": 0.2198, "num_input_tokens_seen": 14824736, "step": 14725 }, { "epoch": 6.944837340876945, "grad_norm": 0.19742292165756226, "learning_rate": 4.125488683409144e-05, "loss": 0.1135, "num_input_tokens_seen": 14829376, "step": 14730 }, { "epoch": 6.947194719471947, "grad_norm": 0.0457027330994606, "learning_rate": 4.124707049197909e-05, "loss": 0.0768, "num_input_tokens_seen": 14834080, "step": 14735 }, { "epoch": 6.94955209806695, "grad_norm": 0.5682111978530884, "learning_rate": 4.123925139951357e-05, "loss": 0.0381, "num_input_tokens_seen": 14839680, "step": 14740 }, { "epoch": 6.951909476661952, "grad_norm": 2.4538872241973877, "learning_rate": 4.123142955801851e-05, "loss": 0.048, "num_input_tokens_seen": 14844224, "step": 14745 }, { "epoch": 6.954266855256955, "grad_norm": 0.30503711104393005, "learning_rate": 4.122360496881802e-05, "loss": 0.0583, "num_input_tokens_seen": 14849216, "step": 14750 }, { "epoch": 6.956624233851956, "grad_norm": 1.7914843559265137, "learning_rate": 4.121577763323667e-05, "loss": 0.1472, "num_input_tokens_seen": 14854688, "step": 14755 }, { "epoch": 6.958981612446959, "grad_norm": 0.860319972038269, "learning_rate": 4.1207947552599495e-05, "loss": 0.2468, "num_input_tokens_seen": 14859200, "step": 14760 }, { "epoch": 6.961338991041961, "grad_norm": 2.008934259414673, "learning_rate": 4.1200114728232e-05, "loss": 0.3713, "num_input_tokens_seen": 14865152, "step": 14765 }, { "epoch": 6.963696369636963, "grad_norm": 0.6113480925559998, "learning_rate": 4.119227916146014e-05, "loss": 0.0799, "num_input_tokens_seen": 14870432, "step": 14770 }, { "epoch": 6.966053748231966, "grad_norm": 0.35347190499305725, "learning_rate": 4.118444085361034e-05, "loss": 0.0785, "num_input_tokens_seen": 14875808, "step": 14775 }, { "epoch": 6.968411126826968, "grad_norm": 0.117021344602108, "learning_rate": 4.11765998060095e-05, "loss": 0.1365, "num_input_tokens_seen": 14881088, "step": 14780 }, { "epoch": 6.970768505421971, "grad_norm": 0.10344883799552917, "learning_rate": 4.116875601998498e-05, "loss": 0.1145, "num_input_tokens_seen": 14886528, "step": 14785 }, { "epoch": 6.973125884016973, "grad_norm": 0.17563439905643463, "learning_rate": 4.11609094968646e-05, "loss": 0.0306, "num_input_tokens_seen": 14891008, "step": 14790 }, { "epoch": 6.975483262611975, "grad_norm": 0.3864651918411255, "learning_rate": 4.115306023797664e-05, "loss": 0.0238, "num_input_tokens_seen": 14895712, "step": 14795 }, { "epoch": 6.977840641206978, "grad_norm": 0.06198040023446083, "learning_rate": 4.1145208244649836e-05, "loss": 0.1115, "num_input_tokens_seen": 14901728, "step": 14800 }, { "epoch": 6.98019801980198, "grad_norm": 0.112961545586586, "learning_rate": 4.113735351821341e-05, "loss": 0.0765, "num_input_tokens_seen": 14908160, "step": 14805 }, { "epoch": 6.982555398396983, "grad_norm": 0.15426157414913177, "learning_rate": 4.112949605999703e-05, "loss": 0.0659, "num_input_tokens_seen": 14914016, "step": 14810 }, { "epoch": 6.984912776991985, "grad_norm": 1.586976170539856, "learning_rate": 4.112163587133083e-05, "loss": 0.1219, "num_input_tokens_seen": 14918400, "step": 14815 }, { "epoch": 6.987270155586987, "grad_norm": 0.5818902850151062, "learning_rate": 4.111377295354541e-05, "loss": 0.1934, "num_input_tokens_seen": 14923168, "step": 14820 }, { "epoch": 6.98962753418199, "grad_norm": 0.6733802556991577, "learning_rate": 4.110590730797182e-05, "loss": 0.0542, "num_input_tokens_seen": 14928064, "step": 14825 }, { "epoch": 6.991984912776992, "grad_norm": 0.2843051254749298, "learning_rate": 4.1098038935941594e-05, "loss": 0.0726, "num_input_tokens_seen": 14933728, "step": 14830 }, { "epoch": 6.994342291371995, "grad_norm": 0.10262142866849899, "learning_rate": 4.10901678387867e-05, "loss": 0.06, "num_input_tokens_seen": 14938944, "step": 14835 }, { "epoch": 6.996699669966997, "grad_norm": 0.29426759481430054, "learning_rate": 4.1082294017839586e-05, "loss": 0.1174, "num_input_tokens_seen": 14944736, "step": 14840 }, { "epoch": 6.999057048561999, "grad_norm": 0.10055702924728394, "learning_rate": 4.107441747443315e-05, "loss": 0.0268, "num_input_tokens_seen": 14948800, "step": 14845 }, { "epoch": 7.0, "eval_loss": 0.15092287957668304, "eval_runtime": 15.1233, "eval_samples_per_second": 62.354, "eval_steps_per_second": 15.605, "num_input_tokens_seen": 14950432, "step": 14847 }, { "epoch": 7.001414427157002, "grad_norm": 0.7252532243728638, "learning_rate": 4.1066538209900776e-05, "loss": 0.2137, "num_input_tokens_seen": 14953344, "step": 14850 }, { "epoch": 7.003771805752004, "grad_norm": 0.11158695816993713, "learning_rate": 4.105865622557627e-05, "loss": 0.0225, "num_input_tokens_seen": 14958496, "step": 14855 }, { "epoch": 7.006129184347006, "grad_norm": 0.559172511100769, "learning_rate": 4.105077152279393e-05, "loss": 0.248, "num_input_tokens_seen": 14963616, "step": 14860 }, { "epoch": 7.008486562942008, "grad_norm": 0.48664912581443787, "learning_rate": 4.104288410288849e-05, "loss": 0.0644, "num_input_tokens_seen": 14968320, "step": 14865 }, { "epoch": 7.0108439415370105, "grad_norm": 0.16977037489414215, "learning_rate": 4.103499396719517e-05, "loss": 0.1157, "num_input_tokens_seen": 14974016, "step": 14870 }, { "epoch": 7.013201320132013, "grad_norm": 1.0109308958053589, "learning_rate": 4.1027101117049635e-05, "loss": 0.1041, "num_input_tokens_seen": 14980416, "step": 14875 }, { "epoch": 7.015558698727015, "grad_norm": 0.3547460436820984, "learning_rate": 4.1019205553787995e-05, "loss": 0.0843, "num_input_tokens_seen": 14985536, "step": 14880 }, { "epoch": 7.017916077322018, "grad_norm": 1.1892321109771729, "learning_rate": 4.1011307278746844e-05, "loss": 0.1156, "num_input_tokens_seen": 14991648, "step": 14885 }, { "epoch": 7.02027345591702, "grad_norm": 0.03840968757867813, "learning_rate": 4.100340629326324e-05, "loss": 0.1782, "num_input_tokens_seen": 14996640, "step": 14890 }, { "epoch": 7.0226308345120225, "grad_norm": 1.1225568056106567, "learning_rate": 4.0995502598674664e-05, "loss": 0.1741, "num_input_tokens_seen": 15001120, "step": 14895 }, { "epoch": 7.024988213107025, "grad_norm": 0.06410740315914154, "learning_rate": 4.098759619631909e-05, "loss": 0.0571, "num_input_tokens_seen": 15006496, "step": 14900 }, { "epoch": 7.027345591702027, "grad_norm": 0.6083229184150696, "learning_rate": 4.0979687087534934e-05, "loss": 0.0605, "num_input_tokens_seen": 15011520, "step": 14905 }, { "epoch": 7.02970297029703, "grad_norm": 1.488634467124939, "learning_rate": 4.097177527366107e-05, "loss": 0.1391, "num_input_tokens_seen": 15017152, "step": 14910 }, { "epoch": 7.032060348892032, "grad_norm": 0.9037811160087585, "learning_rate": 4.096386075603684e-05, "loss": 0.2959, "num_input_tokens_seen": 15021824, "step": 14915 }, { "epoch": 7.0344177274870345, "grad_norm": 0.05482213944196701, "learning_rate": 4.095594353600203e-05, "loss": 0.0732, "num_input_tokens_seen": 15027968, "step": 14920 }, { "epoch": 7.036775106082037, "grad_norm": 1.983161449432373, "learning_rate": 4.0948023614896905e-05, "loss": 0.1793, "num_input_tokens_seen": 15032640, "step": 14925 }, { "epoch": 7.039132484677039, "grad_norm": 0.7677502632141113, "learning_rate": 4.094010099406216e-05, "loss": 0.1125, "num_input_tokens_seen": 15037216, "step": 14930 }, { "epoch": 7.041489863272042, "grad_norm": 0.6957078576087952, "learning_rate": 4.093217567483896e-05, "loss": 0.1568, "num_input_tokens_seen": 15041088, "step": 14935 }, { "epoch": 7.043847241867044, "grad_norm": 1.3740878105163574, "learning_rate": 4.0924247658568935e-05, "loss": 0.2211, "num_input_tokens_seen": 15045632, "step": 14940 }, { "epoch": 7.0462046204620465, "grad_norm": 2.1408159732818604, "learning_rate": 4.091631694659416e-05, "loss": 0.0786, "num_input_tokens_seen": 15049536, "step": 14945 }, { "epoch": 7.048561999057049, "grad_norm": 0.8325873017311096, "learning_rate": 4.0908383540257166e-05, "loss": 0.0665, "num_input_tokens_seen": 15053888, "step": 14950 }, { "epoch": 7.050919377652051, "grad_norm": 0.10826106369495392, "learning_rate": 4.090044744090096e-05, "loss": 0.0195, "num_input_tokens_seen": 15059040, "step": 14955 }, { "epoch": 7.053276756247053, "grad_norm": 0.6243112087249756, "learning_rate": 4.089250864986896e-05, "loss": 0.1211, "num_input_tokens_seen": 15063360, "step": 14960 }, { "epoch": 7.055634134842055, "grad_norm": 0.07998611778020859, "learning_rate": 4.08845671685051e-05, "loss": 0.1339, "num_input_tokens_seen": 15070080, "step": 14965 }, { "epoch": 7.057991513437058, "grad_norm": 0.05319840461015701, "learning_rate": 4.0876622998153716e-05, "loss": 0.2097, "num_input_tokens_seen": 15075328, "step": 14970 }, { "epoch": 7.06034889203206, "grad_norm": 0.1066230833530426, "learning_rate": 4.0868676140159624e-05, "loss": 0.1116, "num_input_tokens_seen": 15080192, "step": 14975 }, { "epoch": 7.0627062706270625, "grad_norm": 0.7309447526931763, "learning_rate": 4.08607265958681e-05, "loss": 0.0669, "num_input_tokens_seen": 15085856, "step": 14980 }, { "epoch": 7.065063649222065, "grad_norm": 0.7373453378677368, "learning_rate": 4.085277436662486e-05, "loss": 0.0553, "num_input_tokens_seen": 15090432, "step": 14985 }, { "epoch": 7.067421027817067, "grad_norm": 1.912026286125183, "learning_rate": 4.0844819453776086e-05, "loss": 0.1604, "num_input_tokens_seen": 15095296, "step": 14990 }, { "epoch": 7.06977840641207, "grad_norm": 0.5904443264007568, "learning_rate": 4.083686185866841e-05, "loss": 0.0808, "num_input_tokens_seen": 15100640, "step": 14995 }, { "epoch": 7.072135785007072, "grad_norm": 0.9040495753288269, "learning_rate": 4.082890158264891e-05, "loss": 0.1896, "num_input_tokens_seen": 15104544, "step": 15000 }, { "epoch": 7.0744931636020745, "grad_norm": 0.34303364157676697, "learning_rate": 4.082093862706514e-05, "loss": 0.0514, "num_input_tokens_seen": 15109312, "step": 15005 }, { "epoch": 7.076850542197077, "grad_norm": 1.54231595993042, "learning_rate": 4.081297299326509e-05, "loss": 0.1206, "num_input_tokens_seen": 15114368, "step": 15010 }, { "epoch": 7.079207920792079, "grad_norm": 0.06533148139715195, "learning_rate": 4.080500468259719e-05, "loss": 0.1017, "num_input_tokens_seen": 15120128, "step": 15015 }, { "epoch": 7.081565299387082, "grad_norm": 0.06004057452082634, "learning_rate": 4.079703369641035e-05, "loss": 0.1012, "num_input_tokens_seen": 15126208, "step": 15020 }, { "epoch": 7.083922677982084, "grad_norm": 1.1371327638626099, "learning_rate": 4.0789060036053926e-05, "loss": 0.1872, "num_input_tokens_seen": 15131552, "step": 15025 }, { "epoch": 7.0862800565770865, "grad_norm": 2.128774642944336, "learning_rate": 4.078108370287772e-05, "loss": 0.1112, "num_input_tokens_seen": 15137312, "step": 15030 }, { "epoch": 7.088637435172089, "grad_norm": 1.3242888450622559, "learning_rate": 4.0773104698232e-05, "loss": 0.1738, "num_input_tokens_seen": 15141536, "step": 15035 }, { "epoch": 7.090994813767091, "grad_norm": 1.3105465173721313, "learning_rate": 4.076512302346746e-05, "loss": 0.2285, "num_input_tokens_seen": 15147232, "step": 15040 }, { "epoch": 7.093352192362094, "grad_norm": 0.3519817888736725, "learning_rate": 4.075713867993526e-05, "loss": 0.1161, "num_input_tokens_seen": 15151680, "step": 15045 }, { "epoch": 7.095709570957096, "grad_norm": 0.8424348831176758, "learning_rate": 4.0749151668987034e-05, "loss": 0.1263, "num_input_tokens_seen": 15156032, "step": 15050 }, { "epoch": 7.0980669495520985, "grad_norm": 0.2646331787109375, "learning_rate": 4.074116199197483e-05, "loss": 0.1409, "num_input_tokens_seen": 15160544, "step": 15055 }, { "epoch": 7.1004243281471, "grad_norm": 0.2093878835439682, "learning_rate": 4.073316965025118e-05, "loss": 0.1248, "num_input_tokens_seen": 15166432, "step": 15060 }, { "epoch": 7.102781706742102, "grad_norm": 0.5103663802146912, "learning_rate": 4.072517464516903e-05, "loss": 0.0617, "num_input_tokens_seen": 15171008, "step": 15065 }, { "epoch": 7.105139085337105, "grad_norm": 1.2124614715576172, "learning_rate": 4.071717697808182e-05, "loss": 0.1945, "num_input_tokens_seen": 15176672, "step": 15070 }, { "epoch": 7.107496463932107, "grad_norm": 1.2216492891311646, "learning_rate": 4.070917665034341e-05, "loss": 0.267, "num_input_tokens_seen": 15180928, "step": 15075 }, { "epoch": 7.10985384252711, "grad_norm": 0.48558664321899414, "learning_rate": 4.0701173663308115e-05, "loss": 0.1443, "num_input_tokens_seen": 15185824, "step": 15080 }, { "epoch": 7.112211221122112, "grad_norm": 0.9880908131599426, "learning_rate": 4.0693168018330705e-05, "loss": 0.0869, "num_input_tokens_seen": 15190080, "step": 15085 }, { "epoch": 7.114568599717114, "grad_norm": 0.16906960308551788, "learning_rate": 4.068515971676641e-05, "loss": 0.0774, "num_input_tokens_seen": 15194880, "step": 15090 }, { "epoch": 7.116925978312117, "grad_norm": 1.0967156887054443, "learning_rate": 4.06771487599709e-05, "loss": 0.0741, "num_input_tokens_seen": 15200800, "step": 15095 }, { "epoch": 7.119283356907119, "grad_norm": 0.3520171344280243, "learning_rate": 4.066913514930027e-05, "loss": 0.0498, "num_input_tokens_seen": 15205120, "step": 15100 }, { "epoch": 7.121640735502122, "grad_norm": 0.3895222246646881, "learning_rate": 4.066111888611112e-05, "loss": 0.0925, "num_input_tokens_seen": 15210304, "step": 15105 }, { "epoch": 7.123998114097124, "grad_norm": 1.817957878112793, "learning_rate": 4.0653099971760456e-05, "loss": 0.127, "num_input_tokens_seen": 15214944, "step": 15110 }, { "epoch": 7.126355492692126, "grad_norm": 1.612833857536316, "learning_rate": 4.064507840760573e-05, "loss": 0.1735, "num_input_tokens_seen": 15219552, "step": 15115 }, { "epoch": 7.128712871287129, "grad_norm": 1.12545645236969, "learning_rate": 4.063705419500487e-05, "loss": 0.1098, "num_input_tokens_seen": 15224032, "step": 15120 }, { "epoch": 7.131070249882131, "grad_norm": 0.0432245098054409, "learning_rate": 4.062902733531625e-05, "loss": 0.1292, "num_input_tokens_seen": 15229888, "step": 15125 }, { "epoch": 7.133427628477134, "grad_norm": 0.22802142798900604, "learning_rate": 4.062099782989866e-05, "loss": 0.0745, "num_input_tokens_seen": 15235392, "step": 15130 }, { "epoch": 7.135785007072136, "grad_norm": 1.075132131576538, "learning_rate": 4.061296568011136e-05, "loss": 0.0777, "num_input_tokens_seen": 15240928, "step": 15135 }, { "epoch": 7.138142385667138, "grad_norm": 1.4627821445465088, "learning_rate": 4.060493088731407e-05, "loss": 0.1502, "num_input_tokens_seen": 15246624, "step": 15140 }, { "epoch": 7.140499764262141, "grad_norm": 0.9638600945472717, "learning_rate": 4.0596893452866936e-05, "loss": 0.1193, "num_input_tokens_seen": 15251168, "step": 15145 }, { "epoch": 7.142857142857143, "grad_norm": 1.3717551231384277, "learning_rate": 4.0588853378130556e-05, "loss": 0.1396, "num_input_tokens_seen": 15255680, "step": 15150 }, { "epoch": 7.145214521452146, "grad_norm": 2.1771697998046875, "learning_rate": 4.058081066446599e-05, "loss": 0.2917, "num_input_tokens_seen": 15262656, "step": 15155 }, { "epoch": 7.147571900047147, "grad_norm": 0.1460159718990326, "learning_rate": 4.057276531323473e-05, "loss": 0.2527, "num_input_tokens_seen": 15267328, "step": 15160 }, { "epoch": 7.1499292786421496, "grad_norm": 0.10426990687847137, "learning_rate": 4.0564717325798695e-05, "loss": 0.1938, "num_input_tokens_seen": 15272992, "step": 15165 }, { "epoch": 7.152286657237152, "grad_norm": 0.5795240998268127, "learning_rate": 4.055666670352031e-05, "loss": 0.1754, "num_input_tokens_seen": 15280224, "step": 15170 }, { "epoch": 7.154644035832154, "grad_norm": 0.32477137446403503, "learning_rate": 4.0548613447762374e-05, "loss": 0.2123, "num_input_tokens_seen": 15284256, "step": 15175 }, { "epoch": 7.157001414427157, "grad_norm": 0.4300313889980316, "learning_rate": 4.0540557559888194e-05, "loss": 0.1316, "num_input_tokens_seen": 15288928, "step": 15180 }, { "epoch": 7.159358793022159, "grad_norm": 0.008376591838896275, "learning_rate": 4.053249904126147e-05, "loss": 0.0131, "num_input_tokens_seen": 15294528, "step": 15185 }, { "epoch": 7.161716171617162, "grad_norm": 0.15123796463012695, "learning_rate": 4.0524437893246396e-05, "loss": 0.2018, "num_input_tokens_seen": 15299776, "step": 15190 }, { "epoch": 7.164073550212164, "grad_norm": 2.0370748043060303, "learning_rate": 4.051637411720758e-05, "loss": 0.2938, "num_input_tokens_seen": 15304608, "step": 15195 }, { "epoch": 7.166430928807166, "grad_norm": 2.3220105171203613, "learning_rate": 4.050830771451007e-05, "loss": 0.3108, "num_input_tokens_seen": 15308864, "step": 15200 }, { "epoch": 7.168788307402169, "grad_norm": 0.5276109576225281, "learning_rate": 4.050023868651938e-05, "loss": 0.156, "num_input_tokens_seen": 15315488, "step": 15205 }, { "epoch": 7.171145685997171, "grad_norm": 1.426711916923523, "learning_rate": 4.049216703460147e-05, "loss": 0.1214, "num_input_tokens_seen": 15320704, "step": 15210 }, { "epoch": 7.173503064592174, "grad_norm": 0.058333396911621094, "learning_rate": 4.0484092760122704e-05, "loss": 0.0649, "num_input_tokens_seen": 15325504, "step": 15215 }, { "epoch": 7.175860443187176, "grad_norm": 1.5002456903457642, "learning_rate": 4.0476015864449955e-05, "loss": 0.26, "num_input_tokens_seen": 15329536, "step": 15220 }, { "epoch": 7.178217821782178, "grad_norm": 1.4921972751617432, "learning_rate": 4.046793634895048e-05, "loss": 0.1403, "num_input_tokens_seen": 15334336, "step": 15225 }, { "epoch": 7.180575200377181, "grad_norm": 0.6623348593711853, "learning_rate": 4.045985421499201e-05, "loss": 0.2873, "num_input_tokens_seen": 15338976, "step": 15230 }, { "epoch": 7.182932578972183, "grad_norm": 0.06298889219760895, "learning_rate": 4.045176946394273e-05, "loss": 0.1843, "num_input_tokens_seen": 15344544, "step": 15235 }, { "epoch": 7.185289957567186, "grad_norm": 0.32020118832588196, "learning_rate": 4.044368209717122e-05, "loss": 0.0807, "num_input_tokens_seen": 15350016, "step": 15240 }, { "epoch": 7.187647336162188, "grad_norm": 2.09521746635437, "learning_rate": 4.043559211604655e-05, "loss": 0.1111, "num_input_tokens_seen": 15354656, "step": 15245 }, { "epoch": 7.19000471475719, "grad_norm": 1.0426183938980103, "learning_rate": 4.042749952193822e-05, "loss": 0.0642, "num_input_tokens_seen": 15360096, "step": 15250 }, { "epoch": 7.192362093352193, "grad_norm": 1.866919994354248, "learning_rate": 4.041940431621617e-05, "loss": 0.2878, "num_input_tokens_seen": 15365344, "step": 15255 }, { "epoch": 7.194719471947194, "grad_norm": 1.0039055347442627, "learning_rate": 4.041130650025077e-05, "loss": 0.0628, "num_input_tokens_seen": 15370208, "step": 15260 }, { "epoch": 7.197076850542197, "grad_norm": 1.744048833847046, "learning_rate": 4.040320607541285e-05, "loss": 0.1788, "num_input_tokens_seen": 15375488, "step": 15265 }, { "epoch": 7.199434229137199, "grad_norm": 0.08481491357088089, "learning_rate": 4.039510304307367e-05, "loss": 0.1082, "num_input_tokens_seen": 15379808, "step": 15270 }, { "epoch": 7.2017916077322015, "grad_norm": 0.2615916132926941, "learning_rate": 4.038699740460494e-05, "loss": 0.1482, "num_input_tokens_seen": 15384544, "step": 15275 }, { "epoch": 7.204148986327204, "grad_norm": 0.38841554522514343, "learning_rate": 4.037888916137881e-05, "loss": 0.0274, "num_input_tokens_seen": 15390304, "step": 15280 }, { "epoch": 7.206506364922206, "grad_norm": 1.26982581615448, "learning_rate": 4.037077831476787e-05, "loss": 0.0968, "num_input_tokens_seen": 15394848, "step": 15285 }, { "epoch": 7.208863743517209, "grad_norm": 0.11694230884313583, "learning_rate": 4.036266486614513e-05, "loss": 0.1192, "num_input_tokens_seen": 15399392, "step": 15290 }, { "epoch": 7.211221122112211, "grad_norm": 0.6348636150360107, "learning_rate": 4.035454881688407e-05, "loss": 0.0479, "num_input_tokens_seen": 15403680, "step": 15295 }, { "epoch": 7.2135785007072135, "grad_norm": 0.04344325512647629, "learning_rate": 4.0346430168358606e-05, "loss": 0.1483, "num_input_tokens_seen": 15408096, "step": 15300 }, { "epoch": 7.215935879302216, "grad_norm": 0.32246699929237366, "learning_rate": 4.033830892194308e-05, "loss": 0.1886, "num_input_tokens_seen": 15412544, "step": 15305 }, { "epoch": 7.218293257897218, "grad_norm": 0.18177613615989685, "learning_rate": 4.033018507901229e-05, "loss": 0.0982, "num_input_tokens_seen": 15418016, "step": 15310 }, { "epoch": 7.220650636492221, "grad_norm": 1.4598270654678345, "learning_rate": 4.032205864094145e-05, "loss": 0.0925, "num_input_tokens_seen": 15422880, "step": 15315 }, { "epoch": 7.223008015087223, "grad_norm": 0.09990409761667252, "learning_rate": 4.031392960910624e-05, "loss": 0.1581, "num_input_tokens_seen": 15427264, "step": 15320 }, { "epoch": 7.2253653936822255, "grad_norm": 0.33932435512542725, "learning_rate": 4.030579798488276e-05, "loss": 0.122, "num_input_tokens_seen": 15432000, "step": 15325 }, { "epoch": 7.227722772277228, "grad_norm": 1.247087001800537, "learning_rate": 4.029766376964756e-05, "loss": 0.225, "num_input_tokens_seen": 15436896, "step": 15330 }, { "epoch": 7.23008015087223, "grad_norm": 2.323995351791382, "learning_rate": 4.028952696477763e-05, "loss": 0.0904, "num_input_tokens_seen": 15441440, "step": 15335 }, { "epoch": 7.232437529467233, "grad_norm": 0.631216287612915, "learning_rate": 4.0281387571650374e-05, "loss": 0.0818, "num_input_tokens_seen": 15446720, "step": 15340 }, { "epoch": 7.234794908062235, "grad_norm": 0.32180342078208923, "learning_rate": 4.027324559164367e-05, "loss": 0.0798, "num_input_tokens_seen": 15451200, "step": 15345 }, { "epoch": 7.2371522866572375, "grad_norm": 1.52275812625885, "learning_rate": 4.026510102613581e-05, "loss": 0.1573, "num_input_tokens_seen": 15456992, "step": 15350 }, { "epoch": 7.23950966525224, "grad_norm": 0.455061137676239, "learning_rate": 4.025695387650554e-05, "loss": 0.1597, "num_input_tokens_seen": 15461056, "step": 15355 }, { "epoch": 7.2418670438472414, "grad_norm": 1.479844093322754, "learning_rate": 4.0248804144132025e-05, "loss": 0.116, "num_input_tokens_seen": 15466208, "step": 15360 }, { "epoch": 7.244224422442244, "grad_norm": 0.03803533315658569, "learning_rate": 4.024065183039487e-05, "loss": 0.1462, "num_input_tokens_seen": 15470400, "step": 15365 }, { "epoch": 7.246581801037246, "grad_norm": 0.1094774529337883, "learning_rate": 4.023249693667414e-05, "loss": 0.1292, "num_input_tokens_seen": 15475168, "step": 15370 }, { "epoch": 7.248939179632249, "grad_norm": 0.4512087106704712, "learning_rate": 4.022433946435031e-05, "loss": 0.0743, "num_input_tokens_seen": 15479872, "step": 15375 }, { "epoch": 7.251296558227251, "grad_norm": 0.038815051317214966, "learning_rate": 4.0216179414804305e-05, "loss": 0.1084, "num_input_tokens_seen": 15485088, "step": 15380 }, { "epoch": 7.2536539368222535, "grad_norm": 0.9063851237297058, "learning_rate": 4.020801678941748e-05, "loss": 0.2372, "num_input_tokens_seen": 15490080, "step": 15385 }, { "epoch": 7.256011315417256, "grad_norm": 1.3820480108261108, "learning_rate": 4.0199851589571625e-05, "loss": 0.2452, "num_input_tokens_seen": 15495360, "step": 15390 }, { "epoch": 7.258368694012258, "grad_norm": 1.7095730304718018, "learning_rate": 4.0191683816648985e-05, "loss": 0.224, "num_input_tokens_seen": 15499840, "step": 15395 }, { "epoch": 7.260726072607261, "grad_norm": 0.5506250858306885, "learning_rate": 4.018351347203221e-05, "loss": 0.1918, "num_input_tokens_seen": 15504352, "step": 15400 }, { "epoch": 7.263083451202263, "grad_norm": 0.2309315800666809, "learning_rate": 4.01753405571044e-05, "loss": 0.1376, "num_input_tokens_seen": 15509504, "step": 15405 }, { "epoch": 7.2654408297972655, "grad_norm": 1.1711918115615845, "learning_rate": 4.01671650732491e-05, "loss": 0.1133, "num_input_tokens_seen": 15515296, "step": 15410 }, { "epoch": 7.267798208392268, "grad_norm": 0.8294540047645569, "learning_rate": 4.015898702185028e-05, "loss": 0.1389, "num_input_tokens_seen": 15520384, "step": 15415 }, { "epoch": 7.27015558698727, "grad_norm": 1.184577465057373, "learning_rate": 4.015080640429234e-05, "loss": 0.1256, "num_input_tokens_seen": 15524960, "step": 15420 }, { "epoch": 7.272512965582273, "grad_norm": 1.2374364137649536, "learning_rate": 4.0142623221960126e-05, "loss": 0.1924, "num_input_tokens_seen": 15530720, "step": 15425 }, { "epoch": 7.274870344177275, "grad_norm": 0.239906445145607, "learning_rate": 4.01344374762389e-05, "loss": 0.3297, "num_input_tokens_seen": 15534976, "step": 15430 }, { "epoch": 7.2772277227722775, "grad_norm": 0.07807737588882446, "learning_rate": 4.012624916851438e-05, "loss": 0.0801, "num_input_tokens_seen": 15541376, "step": 15435 }, { "epoch": 7.27958510136728, "grad_norm": 0.2942451238632202, "learning_rate": 4.0118058300172705e-05, "loss": 0.1214, "num_input_tokens_seen": 15546720, "step": 15440 }, { "epoch": 7.281942479962282, "grad_norm": 0.3533332049846649, "learning_rate": 4.0109864872600456e-05, "loss": 0.2079, "num_input_tokens_seen": 15552320, "step": 15445 }, { "epoch": 7.284299858557285, "grad_norm": 0.07038620114326477, "learning_rate": 4.010166888718463e-05, "loss": 0.127, "num_input_tokens_seen": 15556128, "step": 15450 }, { "epoch": 7.286657237152287, "grad_norm": 1.336058497428894, "learning_rate": 4.009347034531267e-05, "loss": 0.1153, "num_input_tokens_seen": 15560096, "step": 15455 }, { "epoch": 7.2890146157472895, "grad_norm": 1.063057541847229, "learning_rate": 4.008526924837245e-05, "loss": 0.0882, "num_input_tokens_seen": 15565824, "step": 15460 }, { "epoch": 7.291371994342291, "grad_norm": 1.1315377950668335, "learning_rate": 4.007706559775228e-05, "loss": 0.0995, "num_input_tokens_seen": 15570944, "step": 15465 }, { "epoch": 7.293729372937293, "grad_norm": 1.001462459564209, "learning_rate": 4.006885939484091e-05, "loss": 0.1556, "num_input_tokens_seen": 15575968, "step": 15470 }, { "epoch": 7.296086751532296, "grad_norm": 0.260393351316452, "learning_rate": 4.006065064102749e-05, "loss": 0.1866, "num_input_tokens_seen": 15580832, "step": 15475 }, { "epoch": 7.298444130127298, "grad_norm": 0.07144608348608017, "learning_rate": 4.0052439337701644e-05, "loss": 0.0836, "num_input_tokens_seen": 15585152, "step": 15480 }, { "epoch": 7.300801508722301, "grad_norm": 1.5598605871200562, "learning_rate": 4.004422548625338e-05, "loss": 0.293, "num_input_tokens_seen": 15589952, "step": 15485 }, { "epoch": 7.303158887317303, "grad_norm": 0.4486054480075836, "learning_rate": 4.003600908807318e-05, "loss": 0.0618, "num_input_tokens_seen": 15595968, "step": 15490 }, { "epoch": 7.305516265912305, "grad_norm": 0.12377447634935379, "learning_rate": 4.0027790144551935e-05, "loss": 0.0607, "num_input_tokens_seen": 15601152, "step": 15495 }, { "epoch": 7.307873644507308, "grad_norm": 0.38624852895736694, "learning_rate": 4.001956865708099e-05, "loss": 0.11, "num_input_tokens_seen": 15605440, "step": 15500 }, { "epoch": 7.31023102310231, "grad_norm": 0.8563739061355591, "learning_rate": 4.0011344627052075e-05, "loss": 0.1285, "num_input_tokens_seen": 15610496, "step": 15505 }, { "epoch": 7.312588401697313, "grad_norm": 0.2843989431858063, "learning_rate": 4.0003118055857404e-05, "loss": 0.2575, "num_input_tokens_seen": 15616160, "step": 15510 }, { "epoch": 7.314945780292315, "grad_norm": 1.5334528684616089, "learning_rate": 3.999488894488957e-05, "loss": 0.0724, "num_input_tokens_seen": 15620960, "step": 15515 }, { "epoch": 7.317303158887317, "grad_norm": 0.0337386280298233, "learning_rate": 3.998665729554165e-05, "loss": 0.173, "num_input_tokens_seen": 15625600, "step": 15520 }, { "epoch": 7.31966053748232, "grad_norm": 0.5541964769363403, "learning_rate": 3.99784231092071e-05, "loss": 0.0473, "num_input_tokens_seen": 15630304, "step": 15525 }, { "epoch": 7.322017916077322, "grad_norm": 0.13014543056488037, "learning_rate": 3.997018638727984e-05, "loss": 0.2229, "num_input_tokens_seen": 15635232, "step": 15530 }, { "epoch": 7.324375294672325, "grad_norm": 0.2840031683444977, "learning_rate": 3.996194713115421e-05, "loss": 0.1378, "num_input_tokens_seen": 15640576, "step": 15535 }, { "epoch": 7.326732673267327, "grad_norm": 0.29574933648109436, "learning_rate": 3.9953705342224964e-05, "loss": 0.0279, "num_input_tokens_seen": 15644640, "step": 15540 }, { "epoch": 7.329090051862329, "grad_norm": 0.8927000164985657, "learning_rate": 3.99454610218873e-05, "loss": 0.1876, "num_input_tokens_seen": 15651296, "step": 15545 }, { "epoch": 7.331447430457332, "grad_norm": 2.3046820163726807, "learning_rate": 3.993721417153685e-05, "loss": 0.1651, "num_input_tokens_seen": 15656160, "step": 15550 }, { "epoch": 7.333804809052333, "grad_norm": 0.6457992196083069, "learning_rate": 3.9928964792569655e-05, "loss": 0.1749, "num_input_tokens_seen": 15661216, "step": 15555 }, { "epoch": 7.336162187647336, "grad_norm": 2.1936843395233154, "learning_rate": 3.992071288638221e-05, "loss": 0.1413, "num_input_tokens_seen": 15667360, "step": 15560 }, { "epoch": 7.338519566242338, "grad_norm": 0.7491535544395447, "learning_rate": 3.99124584543714e-05, "loss": 0.2865, "num_input_tokens_seen": 15671840, "step": 15565 }, { "epoch": 7.3408769448373405, "grad_norm": 0.4628898501396179, "learning_rate": 3.990420149793458e-05, "loss": 0.2956, "num_input_tokens_seen": 15677312, "step": 15570 }, { "epoch": 7.343234323432343, "grad_norm": 0.05518012493848801, "learning_rate": 3.98959420184695e-05, "loss": 0.0663, "num_input_tokens_seen": 15681408, "step": 15575 }, { "epoch": 7.345591702027345, "grad_norm": 2.1607420444488525, "learning_rate": 3.988768001737436e-05, "loss": 0.2608, "num_input_tokens_seen": 15685984, "step": 15580 }, { "epoch": 7.347949080622348, "grad_norm": 0.6039687991142273, "learning_rate": 3.9879415496047774e-05, "loss": 0.088, "num_input_tokens_seen": 15690400, "step": 15585 }, { "epoch": 7.35030645921735, "grad_norm": 0.857852578163147, "learning_rate": 3.9871148455888775e-05, "loss": 0.1083, "num_input_tokens_seen": 15695936, "step": 15590 }, { "epoch": 7.3526638378123526, "grad_norm": 0.6978694796562195, "learning_rate": 3.986287889829685e-05, "loss": 0.1259, "num_input_tokens_seen": 15701184, "step": 15595 }, { "epoch": 7.355021216407355, "grad_norm": 0.20569147169589996, "learning_rate": 3.985460682467187e-05, "loss": 0.1399, "num_input_tokens_seen": 15705760, "step": 15600 }, { "epoch": 7.357378595002357, "grad_norm": 0.7580237984657288, "learning_rate": 3.9846332236414186e-05, "loss": 0.1596, "num_input_tokens_seen": 15710592, "step": 15605 }, { "epoch": 7.35973597359736, "grad_norm": 1.9150859117507935, "learning_rate": 3.9838055134924526e-05, "loss": 0.0991, "num_input_tokens_seen": 15714912, "step": 15610 }, { "epoch": 7.362093352192362, "grad_norm": 0.6993288993835449, "learning_rate": 3.982977552160406e-05, "loss": 0.1805, "num_input_tokens_seen": 15720032, "step": 15615 }, { "epoch": 7.364450730787365, "grad_norm": 0.16823549568653107, "learning_rate": 3.98214933978544e-05, "loss": 0.1608, "num_input_tokens_seen": 15725184, "step": 15620 }, { "epoch": 7.366808109382367, "grad_norm": 1.8885544538497925, "learning_rate": 3.981320876507757e-05, "loss": 0.1788, "num_input_tokens_seen": 15730208, "step": 15625 }, { "epoch": 7.369165487977369, "grad_norm": 0.5003657341003418, "learning_rate": 3.9804921624676e-05, "loss": 0.1576, "num_input_tokens_seen": 15734944, "step": 15630 }, { "epoch": 7.371522866572372, "grad_norm": 0.1056181862950325, "learning_rate": 3.9796631978052575e-05, "loss": 0.1038, "num_input_tokens_seen": 15739648, "step": 15635 }, { "epoch": 7.373880245167374, "grad_norm": 0.9373593330383301, "learning_rate": 3.978833982661059e-05, "loss": 0.2457, "num_input_tokens_seen": 15744544, "step": 15640 }, { "epoch": 7.376237623762377, "grad_norm": 0.06322495639324188, "learning_rate": 3.9780045171753764e-05, "loss": 0.097, "num_input_tokens_seen": 15749344, "step": 15645 }, { "epoch": 7.378595002357379, "grad_norm": 0.27326270937919617, "learning_rate": 3.977174801488624e-05, "loss": 0.3234, "num_input_tokens_seen": 15755488, "step": 15650 }, { "epoch": 7.380952380952381, "grad_norm": 0.20233136415481567, "learning_rate": 3.976344835741258e-05, "loss": 0.1286, "num_input_tokens_seen": 15760224, "step": 15655 }, { "epoch": 7.383309759547384, "grad_norm": 0.3264143466949463, "learning_rate": 3.975514620073779e-05, "loss": 0.0586, "num_input_tokens_seen": 15764512, "step": 15660 }, { "epoch": 7.385667138142385, "grad_norm": 1.7813488245010376, "learning_rate": 3.974684154626728e-05, "loss": 0.2617, "num_input_tokens_seen": 15769760, "step": 15665 }, { "epoch": 7.388024516737388, "grad_norm": 0.7133809328079224, "learning_rate": 3.973853439540688e-05, "loss": 0.0945, "num_input_tokens_seen": 15774656, "step": 15670 }, { "epoch": 7.39038189533239, "grad_norm": 0.6659950017929077, "learning_rate": 3.973022474956285e-05, "loss": 0.1161, "num_input_tokens_seen": 15779008, "step": 15675 }, { "epoch": 7.3927392739273925, "grad_norm": 0.4239707887172699, "learning_rate": 3.972191261014188e-05, "loss": 0.0939, "num_input_tokens_seen": 15784032, "step": 15680 }, { "epoch": 7.395096652522395, "grad_norm": 0.31810158491134644, "learning_rate": 3.9713597978551066e-05, "loss": 0.0357, "num_input_tokens_seen": 15789952, "step": 15685 }, { "epoch": 7.397454031117397, "grad_norm": 1.5871548652648926, "learning_rate": 3.970528085619794e-05, "loss": 0.1526, "num_input_tokens_seen": 15794048, "step": 15690 }, { "epoch": 7.3998114097124, "grad_norm": 0.25589191913604736, "learning_rate": 3.969696124449044e-05, "loss": 0.0934, "num_input_tokens_seen": 15799456, "step": 15695 }, { "epoch": 7.402168788307402, "grad_norm": 0.10457839071750641, "learning_rate": 3.968863914483693e-05, "loss": 0.1602, "num_input_tokens_seen": 15804576, "step": 15700 }, { "epoch": 7.4045261669024045, "grad_norm": 0.06897854059934616, "learning_rate": 3.9680314558646233e-05, "loss": 0.0469, "num_input_tokens_seen": 15809952, "step": 15705 }, { "epoch": 7.406883545497407, "grad_norm": 0.17012429237365723, "learning_rate": 3.967198748732753e-05, "loss": 0.1553, "num_input_tokens_seen": 15814528, "step": 15710 }, { "epoch": 7.409240924092409, "grad_norm": 0.6096392273902893, "learning_rate": 3.9663657932290466e-05, "loss": 0.1492, "num_input_tokens_seen": 15819488, "step": 15715 }, { "epoch": 7.411598302687412, "grad_norm": 2.2790184020996094, "learning_rate": 3.965532589494508e-05, "loss": 0.1072, "num_input_tokens_seen": 15824160, "step": 15720 }, { "epoch": 7.413955681282414, "grad_norm": 1.2059504985809326, "learning_rate": 3.964699137670186e-05, "loss": 0.1172, "num_input_tokens_seen": 15828384, "step": 15725 }, { "epoch": 7.4163130598774165, "grad_norm": 0.2866685092449188, "learning_rate": 3.963865437897169e-05, "loss": 0.1499, "num_input_tokens_seen": 15833920, "step": 15730 }, { "epoch": 7.418670438472419, "grad_norm": 0.18698640167713165, "learning_rate": 3.9630314903165886e-05, "loss": 0.1464, "num_input_tokens_seen": 15838496, "step": 15735 }, { "epoch": 7.421027817067421, "grad_norm": 0.17140188813209534, "learning_rate": 3.9621972950696185e-05, "loss": 0.0288, "num_input_tokens_seen": 15844096, "step": 15740 }, { "epoch": 7.423385195662424, "grad_norm": 0.9259467720985413, "learning_rate": 3.961362852297472e-05, "loss": 0.1333, "num_input_tokens_seen": 15848704, "step": 15745 }, { "epoch": 7.425742574257426, "grad_norm": 0.1617744117975235, "learning_rate": 3.960528162141407e-05, "loss": 0.1358, "num_input_tokens_seen": 15853792, "step": 15750 }, { "epoch": 7.428099952852428, "grad_norm": 0.3698999285697937, "learning_rate": 3.959693224742723e-05, "loss": 0.1404, "num_input_tokens_seen": 15858848, "step": 15755 }, { "epoch": 7.43045733144743, "grad_norm": 0.19528426229953766, "learning_rate": 3.95885804024276e-05, "loss": 0.1876, "num_input_tokens_seen": 15864032, "step": 15760 }, { "epoch": 7.432814710042432, "grad_norm": 0.2027839571237564, "learning_rate": 3.958022608782901e-05, "loss": 0.165, "num_input_tokens_seen": 15868832, "step": 15765 }, { "epoch": 7.435172088637435, "grad_norm": 0.23390038311481476, "learning_rate": 3.95718693050457e-05, "loss": 0.0957, "num_input_tokens_seen": 15873344, "step": 15770 }, { "epoch": 7.437529467232437, "grad_norm": 0.30654945969581604, "learning_rate": 3.9563510055492336e-05, "loss": 0.0764, "num_input_tokens_seen": 15878368, "step": 15775 }, { "epoch": 7.43988684582744, "grad_norm": 0.527228057384491, "learning_rate": 3.9555148340583994e-05, "loss": 0.0591, "num_input_tokens_seen": 15884064, "step": 15780 }, { "epoch": 7.442244224422442, "grad_norm": 1.0618079900741577, "learning_rate": 3.954678416173616e-05, "loss": 0.1305, "num_input_tokens_seen": 15891200, "step": 15785 }, { "epoch": 7.4446016030174444, "grad_norm": 1.5685465335845947, "learning_rate": 3.9538417520364765e-05, "loss": 0.2691, "num_input_tokens_seen": 15895584, "step": 15790 }, { "epoch": 7.446958981612447, "grad_norm": 0.9990586638450623, "learning_rate": 3.953004841788613e-05, "loss": 0.138, "num_input_tokens_seen": 15900544, "step": 15795 }, { "epoch": 7.449316360207449, "grad_norm": 1.6993809938430786, "learning_rate": 3.9521676855717006e-05, "loss": 0.2289, "num_input_tokens_seen": 15904640, "step": 15800 }, { "epoch": 7.451673738802452, "grad_norm": 2.1274006366729736, "learning_rate": 3.9513302835274557e-05, "loss": 0.2135, "num_input_tokens_seen": 15910656, "step": 15805 }, { "epoch": 7.454031117397454, "grad_norm": 0.18265892565250397, "learning_rate": 3.950492635797636e-05, "loss": 0.0574, "num_input_tokens_seen": 15914912, "step": 15810 }, { "epoch": 7.4563884959924565, "grad_norm": 1.011342167854309, "learning_rate": 3.94965474252404e-05, "loss": 0.0768, "num_input_tokens_seen": 15920256, "step": 15815 }, { "epoch": 7.458745874587459, "grad_norm": 0.39590826630592346, "learning_rate": 3.948816603848511e-05, "loss": 0.1448, "num_input_tokens_seen": 15924928, "step": 15820 }, { "epoch": 7.461103253182461, "grad_norm": 0.07313279807567596, "learning_rate": 3.94797821991293e-05, "loss": 0.1566, "num_input_tokens_seen": 15929792, "step": 15825 }, { "epoch": 7.463460631777464, "grad_norm": 1.50905442237854, "learning_rate": 3.9471395908592214e-05, "loss": 0.0911, "num_input_tokens_seen": 15934976, "step": 15830 }, { "epoch": 7.465818010372466, "grad_norm": 1.0157240629196167, "learning_rate": 3.946300716829352e-05, "loss": 0.0864, "num_input_tokens_seen": 15939936, "step": 15835 }, { "epoch": 7.4681753889674685, "grad_norm": 0.19750148057937622, "learning_rate": 3.945461597965327e-05, "loss": 0.0277, "num_input_tokens_seen": 15943840, "step": 15840 }, { "epoch": 7.470532767562471, "grad_norm": 0.1474730372428894, "learning_rate": 3.9446222344091965e-05, "loss": 0.1222, "num_input_tokens_seen": 15948800, "step": 15845 }, { "epoch": 7.472890146157473, "grad_norm": 1.5270235538482666, "learning_rate": 3.94378262630305e-05, "loss": 0.1243, "num_input_tokens_seen": 15952800, "step": 15850 }, { "epoch": 7.475247524752476, "grad_norm": 1.8799368143081665, "learning_rate": 3.942942773789019e-05, "loss": 0.091, "num_input_tokens_seen": 15957472, "step": 15855 }, { "epoch": 7.477604903347478, "grad_norm": 1.3833738565444946, "learning_rate": 3.9421026770092756e-05, "loss": 0.1419, "num_input_tokens_seen": 15963424, "step": 15860 }, { "epoch": 7.47996228194248, "grad_norm": 0.7096436619758606, "learning_rate": 3.941262336106035e-05, "loss": 0.2388, "num_input_tokens_seen": 15968288, "step": 15865 }, { "epoch": 7.482319660537482, "grad_norm": 0.3657852113246918, "learning_rate": 3.9404217512215525e-05, "loss": 0.0816, "num_input_tokens_seen": 15972416, "step": 15870 }, { "epoch": 7.484677039132484, "grad_norm": 0.08761603385210037, "learning_rate": 3.939580922498124e-05, "loss": 0.0451, "num_input_tokens_seen": 15977600, "step": 15875 }, { "epoch": 7.487034417727487, "grad_norm": 1.1589741706848145, "learning_rate": 3.9387398500780884e-05, "loss": 0.3321, "num_input_tokens_seen": 15983744, "step": 15880 }, { "epoch": 7.489391796322489, "grad_norm": 0.6374592185020447, "learning_rate": 3.9378985341038235e-05, "loss": 0.0432, "num_input_tokens_seen": 15988800, "step": 15885 }, { "epoch": 7.491749174917492, "grad_norm": 1.0483925342559814, "learning_rate": 3.9370569747177514e-05, "loss": 0.1854, "num_input_tokens_seen": 15994080, "step": 15890 }, { "epoch": 7.494106553512494, "grad_norm": 1.1843730211257935, "learning_rate": 3.9362151720623336e-05, "loss": 0.2173, "num_input_tokens_seen": 15999328, "step": 15895 }, { "epoch": 7.496463932107496, "grad_norm": 1.133039951324463, "learning_rate": 3.935373126280073e-05, "loss": 0.1905, "num_input_tokens_seen": 16005408, "step": 15900 }, { "epoch": 7.498821310702499, "grad_norm": 0.7180706858634949, "learning_rate": 3.934530837513512e-05, "loss": 0.1595, "num_input_tokens_seen": 16010592, "step": 15905 }, { "epoch": 7.501178689297501, "grad_norm": 0.0676107332110405, "learning_rate": 3.9336883059052385e-05, "loss": 0.0857, "num_input_tokens_seen": 16015296, "step": 15910 }, { "epoch": 7.503536067892504, "grad_norm": 1.4460325241088867, "learning_rate": 3.9328455315978775e-05, "loss": 0.1869, "num_input_tokens_seen": 16020704, "step": 15915 }, { "epoch": 7.505893446487506, "grad_norm": 0.21128220856189728, "learning_rate": 3.932002514734096e-05, "loss": 0.0514, "num_input_tokens_seen": 16025728, "step": 15920 }, { "epoch": 7.508250825082508, "grad_norm": 0.019484542310237885, "learning_rate": 3.9311592554566026e-05, "loss": 0.0984, "num_input_tokens_seen": 16031072, "step": 15925 }, { "epoch": 7.510608203677511, "grad_norm": 1.0616002082824707, "learning_rate": 3.930315753908147e-05, "loss": 0.0573, "num_input_tokens_seen": 16035040, "step": 15930 }, { "epoch": 7.512965582272513, "grad_norm": 0.2787204086780548, "learning_rate": 3.92947201023152e-05, "loss": 0.1264, "num_input_tokens_seen": 16041760, "step": 15935 }, { "epoch": 7.515322960867516, "grad_norm": 1.9420509338378906, "learning_rate": 3.928628024569552e-05, "loss": 0.1933, "num_input_tokens_seen": 16046560, "step": 15940 }, { "epoch": 7.517680339462518, "grad_norm": 0.5261356234550476, "learning_rate": 3.9277837970651165e-05, "loss": 0.1998, "num_input_tokens_seen": 16052704, "step": 15945 }, { "epoch": 7.52003771805752, "grad_norm": 0.14455127716064453, "learning_rate": 3.926939327861127e-05, "loss": 0.1095, "num_input_tokens_seen": 16057312, "step": 15950 }, { "epoch": 7.522395096652522, "grad_norm": 2.3283908367156982, "learning_rate": 3.926094617100537e-05, "loss": 0.314, "num_input_tokens_seen": 16062048, "step": 15955 }, { "epoch": 7.524752475247524, "grad_norm": 1.6163054704666138, "learning_rate": 3.925249664926342e-05, "loss": 0.1201, "num_input_tokens_seen": 16066656, "step": 15960 }, { "epoch": 7.527109853842527, "grad_norm": 1.9600218534469604, "learning_rate": 3.924404471481578e-05, "loss": 0.1327, "num_input_tokens_seen": 16071264, "step": 15965 }, { "epoch": 7.529467232437529, "grad_norm": 0.25390586256980896, "learning_rate": 3.9235590369093225e-05, "loss": 0.0728, "num_input_tokens_seen": 16077632, "step": 15970 }, { "epoch": 7.5318246110325315, "grad_norm": 0.21172672510147095, "learning_rate": 3.922713361352692e-05, "loss": 0.1127, "num_input_tokens_seen": 16082208, "step": 15975 }, { "epoch": 7.534181989627534, "grad_norm": 0.6384190320968628, "learning_rate": 3.9218674449548466e-05, "loss": 0.0583, "num_input_tokens_seen": 16086464, "step": 15980 }, { "epoch": 7.536539368222536, "grad_norm": 0.20826774835586548, "learning_rate": 3.921021287858984e-05, "loss": 0.0912, "num_input_tokens_seen": 16092000, "step": 15985 }, { "epoch": 7.538896746817539, "grad_norm": 1.0100358724594116, "learning_rate": 3.920174890208345e-05, "loss": 0.2073, "num_input_tokens_seen": 16096352, "step": 15990 }, { "epoch": 7.541254125412541, "grad_norm": 0.5367146730422974, "learning_rate": 3.9193282521462106e-05, "loss": 0.1266, "num_input_tokens_seen": 16101664, "step": 15995 }, { "epoch": 7.5436115040075435, "grad_norm": 1.0561410188674927, "learning_rate": 3.918481373815901e-05, "loss": 0.1457, "num_input_tokens_seen": 16107040, "step": 16000 }, { "epoch": 7.545968882602546, "grad_norm": 1.2720929384231567, "learning_rate": 3.9176342553607806e-05, "loss": 0.0835, "num_input_tokens_seen": 16111744, "step": 16005 }, { "epoch": 7.548326261197548, "grad_norm": 1.2813082933425903, "learning_rate": 3.91678689692425e-05, "loss": 0.2402, "num_input_tokens_seen": 16116512, "step": 16010 }, { "epoch": 7.550683639792551, "grad_norm": 0.6698370575904846, "learning_rate": 3.915939298649753e-05, "loss": 0.0548, "num_input_tokens_seen": 16121792, "step": 16015 }, { "epoch": 7.553041018387553, "grad_norm": 0.36145392060279846, "learning_rate": 3.915091460680775e-05, "loss": 0.1599, "num_input_tokens_seen": 16126816, "step": 16020 }, { "epoch": 7.5553983969825556, "grad_norm": 1.5102876424789429, "learning_rate": 3.914243383160839e-05, "loss": 0.1632, "num_input_tokens_seen": 16131744, "step": 16025 }, { "epoch": 7.557755775577558, "grad_norm": 0.08754616975784302, "learning_rate": 3.9133950662335114e-05, "loss": 0.101, "num_input_tokens_seen": 16136544, "step": 16030 }, { "epoch": 7.56011315417256, "grad_norm": 0.13715112209320068, "learning_rate": 3.912546510042396e-05, "loss": 0.0664, "num_input_tokens_seen": 16141888, "step": 16035 }, { "epoch": 7.562470532767563, "grad_norm": 0.6085160374641418, "learning_rate": 3.911697714731141e-05, "loss": 0.0982, "num_input_tokens_seen": 16146688, "step": 16040 }, { "epoch": 7.564827911362565, "grad_norm": 1.407800316810608, "learning_rate": 3.910848680443433e-05, "loss": 0.2252, "num_input_tokens_seen": 16151424, "step": 16045 }, { "epoch": 7.567185289957568, "grad_norm": 0.28447064757347107, "learning_rate": 3.909999407322996e-05, "loss": 0.0476, "num_input_tokens_seen": 16156192, "step": 16050 }, { "epoch": 7.56954266855257, "grad_norm": 1.054975986480713, "learning_rate": 3.909149895513602e-05, "loss": 0.0594, "num_input_tokens_seen": 16161120, "step": 16055 }, { "epoch": 7.571900047147572, "grad_norm": 0.3502075970172882, "learning_rate": 3.908300145159055e-05, "loss": 0.0728, "num_input_tokens_seen": 16166080, "step": 16060 }, { "epoch": 7.574257425742574, "grad_norm": 1.619117021560669, "learning_rate": 3.907450156403206e-05, "loss": 0.1246, "num_input_tokens_seen": 16170912, "step": 16065 }, { "epoch": 7.576614804337576, "grad_norm": 0.345877081155777, "learning_rate": 3.9065999293899425e-05, "loss": 0.1532, "num_input_tokens_seen": 16176160, "step": 16070 }, { "epoch": 7.578972182932579, "grad_norm": 0.16824743151664734, "learning_rate": 3.905749464263193e-05, "loss": 0.0748, "num_input_tokens_seen": 16181728, "step": 16075 }, { "epoch": 7.581329561527581, "grad_norm": 0.17622290551662445, "learning_rate": 3.904898761166929e-05, "loss": 0.047, "num_input_tokens_seen": 16186752, "step": 16080 }, { "epoch": 7.5836869401225835, "grad_norm": 0.27504709362983704, "learning_rate": 3.904047820245157e-05, "loss": 0.1347, "num_input_tokens_seen": 16191296, "step": 16085 }, { "epoch": 7.586044318717586, "grad_norm": 0.2863938510417938, "learning_rate": 3.903196641641929e-05, "loss": 0.1184, "num_input_tokens_seen": 16195680, "step": 16090 }, { "epoch": 7.588401697312588, "grad_norm": 1.4284926652908325, "learning_rate": 3.902345225501334e-05, "loss": 0.1459, "num_input_tokens_seen": 16201664, "step": 16095 }, { "epoch": 7.590759075907591, "grad_norm": 0.06902270019054413, "learning_rate": 3.901493571967504e-05, "loss": 0.1589, "num_input_tokens_seen": 16206496, "step": 16100 }, { "epoch": 7.593116454502593, "grad_norm": 0.6306257247924805, "learning_rate": 3.900641681184607e-05, "loss": 0.0567, "num_input_tokens_seen": 16212288, "step": 16105 }, { "epoch": 7.5954738330975955, "grad_norm": 0.2236207276582718, "learning_rate": 3.8997895532968554e-05, "loss": 0.0862, "num_input_tokens_seen": 16217952, "step": 16110 }, { "epoch": 7.597831211692598, "grad_norm": 0.877753734588623, "learning_rate": 3.8989371884484994e-05, "loss": 0.226, "num_input_tokens_seen": 16222560, "step": 16115 }, { "epoch": 7.6001885902876, "grad_norm": 0.8944889903068542, "learning_rate": 3.898084586783831e-05, "loss": 0.129, "num_input_tokens_seen": 16227232, "step": 16120 }, { "epoch": 7.602545968882603, "grad_norm": 0.4118766784667969, "learning_rate": 3.897231748447179e-05, "loss": 0.0339, "num_input_tokens_seen": 16232128, "step": 16125 }, { "epoch": 7.604903347477605, "grad_norm": 0.14654846489429474, "learning_rate": 3.896378673582915e-05, "loss": 0.0666, "num_input_tokens_seen": 16237120, "step": 16130 }, { "epoch": 7.6072607260726075, "grad_norm": 1.4460054636001587, "learning_rate": 3.895525362335452e-05, "loss": 0.1786, "num_input_tokens_seen": 16242080, "step": 16135 }, { "epoch": 7.60961810466761, "grad_norm": 1.0029840469360352, "learning_rate": 3.89467181484924e-05, "loss": 0.1475, "num_input_tokens_seen": 16246368, "step": 16140 }, { "epoch": 7.611975483262612, "grad_norm": 0.6328390836715698, "learning_rate": 3.8938180312687686e-05, "loss": 0.1224, "num_input_tokens_seen": 16251584, "step": 16145 }, { "epoch": 7.614332861857615, "grad_norm": 1.05900239944458, "learning_rate": 3.892964011738571e-05, "loss": 0.2192, "num_input_tokens_seen": 16255552, "step": 16150 }, { "epoch": 7.616690240452616, "grad_norm": 0.4488602876663208, "learning_rate": 3.892109756403218e-05, "loss": 0.2286, "num_input_tokens_seen": 16260960, "step": 16155 }, { "epoch": 7.619047619047619, "grad_norm": 0.528354823589325, "learning_rate": 3.891255265407319e-05, "loss": 0.0739, "num_input_tokens_seen": 16265728, "step": 16160 }, { "epoch": 7.621404997642621, "grad_norm": 0.26036983728408813, "learning_rate": 3.890400538895526e-05, "loss": 0.0848, "num_input_tokens_seen": 16269312, "step": 16165 }, { "epoch": 7.623762376237623, "grad_norm": 1.2830530405044556, "learning_rate": 3.88954557701253e-05, "loss": 0.1851, "num_input_tokens_seen": 16273152, "step": 16170 }, { "epoch": 7.626119754832626, "grad_norm": 0.10953973233699799, "learning_rate": 3.88869037990306e-05, "loss": 0.0351, "num_input_tokens_seen": 16278112, "step": 16175 }, { "epoch": 7.628477133427628, "grad_norm": 1.5406455993652344, "learning_rate": 3.887834947711888e-05, "loss": 0.1095, "num_input_tokens_seen": 16282688, "step": 16180 }, { "epoch": 7.630834512022631, "grad_norm": 1.58681321144104, "learning_rate": 3.886979280583823e-05, "loss": 0.1023, "num_input_tokens_seen": 16287136, "step": 16185 }, { "epoch": 7.633191890617633, "grad_norm": 0.450340211391449, "learning_rate": 3.886123378663715e-05, "loss": 0.1471, "num_input_tokens_seen": 16292480, "step": 16190 }, { "epoch": 7.635549269212635, "grad_norm": 0.39348316192626953, "learning_rate": 3.885267242096455e-05, "loss": 0.0491, "num_input_tokens_seen": 16297728, "step": 16195 }, { "epoch": 7.637906647807638, "grad_norm": 1.7117124795913696, "learning_rate": 3.884410871026971e-05, "loss": 0.096, "num_input_tokens_seen": 16301728, "step": 16200 }, { "epoch": 7.64026402640264, "grad_norm": 1.5730764865875244, "learning_rate": 3.8835542656002324e-05, "loss": 0.2611, "num_input_tokens_seen": 16307616, "step": 16205 }, { "epoch": 7.642621404997643, "grad_norm": 0.1638493537902832, "learning_rate": 3.882697425961248e-05, "loss": 0.0886, "num_input_tokens_seen": 16313056, "step": 16210 }, { "epoch": 7.644978783592645, "grad_norm": 1.1667380332946777, "learning_rate": 3.8818403522550674e-05, "loss": 0.1556, "num_input_tokens_seen": 16317920, "step": 16215 }, { "epoch": 7.6473361621876474, "grad_norm": 2.2609474658966064, "learning_rate": 3.8809830446267756e-05, "loss": 0.1539, "num_input_tokens_seen": 16322592, "step": 16220 }, { "epoch": 7.64969354078265, "grad_norm": 0.8842202425003052, "learning_rate": 3.8801255032215036e-05, "loss": 0.1537, "num_input_tokens_seen": 16327648, "step": 16225 }, { "epoch": 7.652050919377652, "grad_norm": 0.42470067739486694, "learning_rate": 3.879267728184416e-05, "loss": 0.0469, "num_input_tokens_seen": 16332768, "step": 16230 }, { "epoch": 7.654408297972655, "grad_norm": 1.8970335721969604, "learning_rate": 3.8784097196607215e-05, "loss": 0.1446, "num_input_tokens_seen": 16336576, "step": 16235 }, { "epoch": 7.656765676567657, "grad_norm": 0.22374287247657776, "learning_rate": 3.8775514777956645e-05, "loss": 0.0321, "num_input_tokens_seen": 16341472, "step": 16240 }, { "epoch": 7.6591230551626595, "grad_norm": 1.0525429248809814, "learning_rate": 3.8766930027345326e-05, "loss": 0.2432, "num_input_tokens_seen": 16347264, "step": 16245 }, { "epoch": 7.661480433757662, "grad_norm": 0.6523619890213013, "learning_rate": 3.87583429462265e-05, "loss": 0.104, "num_input_tokens_seen": 16351712, "step": 16250 }, { "epoch": 7.663837812352664, "grad_norm": 0.38953208923339844, "learning_rate": 3.8749753536053824e-05, "loss": 0.0912, "num_input_tokens_seen": 16356512, "step": 16255 }, { "epoch": 7.666195190947667, "grad_norm": 0.14694362878799438, "learning_rate": 3.874116179828132e-05, "loss": 0.1071, "num_input_tokens_seen": 16362560, "step": 16260 }, { "epoch": 7.668552569542668, "grad_norm": 0.18090958893299103, "learning_rate": 3.873256773436344e-05, "loss": 0.1191, "num_input_tokens_seen": 16367520, "step": 16265 }, { "epoch": 7.670909948137671, "grad_norm": 0.05484093353152275, "learning_rate": 3.8723971345755005e-05, "loss": 0.1967, "num_input_tokens_seen": 16372800, "step": 16270 }, { "epoch": 7.673267326732673, "grad_norm": 1.3144482374191284, "learning_rate": 3.8715372633911254e-05, "loss": 0.1319, "num_input_tokens_seen": 16378272, "step": 16275 }, { "epoch": 7.675624705327675, "grad_norm": 0.9630619287490845, "learning_rate": 3.870677160028777e-05, "loss": 0.0782, "num_input_tokens_seen": 16382944, "step": 16280 }, { "epoch": 7.677982083922678, "grad_norm": 1.7578517198562622, "learning_rate": 3.86981682463406e-05, "loss": 0.1908, "num_input_tokens_seen": 16387936, "step": 16285 }, { "epoch": 7.68033946251768, "grad_norm": 0.09639333933591843, "learning_rate": 3.8689562573526116e-05, "loss": 0.0919, "num_input_tokens_seen": 16392896, "step": 16290 }, { "epoch": 7.682696841112683, "grad_norm": 0.44776293635368347, "learning_rate": 3.868095458330113e-05, "loss": 0.2033, "num_input_tokens_seen": 16398688, "step": 16295 }, { "epoch": 7.685054219707685, "grad_norm": 0.8211609125137329, "learning_rate": 3.867234427712282e-05, "loss": 0.1452, "num_input_tokens_seen": 16404032, "step": 16300 }, { "epoch": 7.687411598302687, "grad_norm": 0.06479299813508987, "learning_rate": 3.8663731656448764e-05, "loss": 0.1178, "num_input_tokens_seen": 16408992, "step": 16305 }, { "epoch": 7.68976897689769, "grad_norm": 0.499746173620224, "learning_rate": 3.865511672273694e-05, "loss": 0.0321, "num_input_tokens_seen": 16414688, "step": 16310 }, { "epoch": 7.692126355492692, "grad_norm": 0.6811588406562805, "learning_rate": 3.864649947744571e-05, "loss": 0.1934, "num_input_tokens_seen": 16419456, "step": 16315 }, { "epoch": 7.694483734087695, "grad_norm": 1.9901353120803833, "learning_rate": 3.863787992203381e-05, "loss": 0.1363, "num_input_tokens_seen": 16424160, "step": 16320 }, { "epoch": 7.696841112682697, "grad_norm": 0.6889695525169373, "learning_rate": 3.8629258057960416e-05, "loss": 0.1531, "num_input_tokens_seen": 16429792, "step": 16325 }, { "epoch": 7.699198491277699, "grad_norm": 0.5513397455215454, "learning_rate": 3.862063388668503e-05, "loss": 0.0933, "num_input_tokens_seen": 16434976, "step": 16330 }, { "epoch": 7.701555869872702, "grad_norm": 2.086420774459839, "learning_rate": 3.8612007409667595e-05, "loss": 0.2653, "num_input_tokens_seen": 16440640, "step": 16335 }, { "epoch": 7.703913248467704, "grad_norm": 0.7365462183952332, "learning_rate": 3.860337862836843e-05, "loss": 0.1044, "num_input_tokens_seen": 16446144, "step": 16340 }, { "epoch": 7.706270627062707, "grad_norm": 0.7038167119026184, "learning_rate": 3.859474754424822e-05, "loss": 0.2023, "num_input_tokens_seen": 16450720, "step": 16345 }, { "epoch": 7.708628005657709, "grad_norm": 1.9488716125488281, "learning_rate": 3.858611415876809e-05, "loss": 0.176, "num_input_tokens_seen": 16455904, "step": 16350 }, { "epoch": 7.7109853842527105, "grad_norm": 0.3422386050224304, "learning_rate": 3.8577478473389504e-05, "loss": 0.1995, "num_input_tokens_seen": 16460832, "step": 16355 }, { "epoch": 7.713342762847713, "grad_norm": 0.5104775428771973, "learning_rate": 3.8568840489574356e-05, "loss": 0.2152, "num_input_tokens_seen": 16465920, "step": 16360 }, { "epoch": 7.715700141442715, "grad_norm": 1.936292052268982, "learning_rate": 3.856020020878489e-05, "loss": 0.3332, "num_input_tokens_seen": 16470432, "step": 16365 }, { "epoch": 7.718057520037718, "grad_norm": 0.4671347439289093, "learning_rate": 3.855155763248377e-05, "loss": 0.0749, "num_input_tokens_seen": 16474464, "step": 16370 }, { "epoch": 7.72041489863272, "grad_norm": 1.187242865562439, "learning_rate": 3.8542912762134035e-05, "loss": 0.0666, "num_input_tokens_seen": 16478848, "step": 16375 }, { "epoch": 7.7227722772277225, "grad_norm": 0.9806304574012756, "learning_rate": 3.853426559919911e-05, "loss": 0.1661, "num_input_tokens_seen": 16483840, "step": 16380 }, { "epoch": 7.725129655822725, "grad_norm": 0.11352197825908661, "learning_rate": 3.852561614514282e-05, "loss": 0.0191, "num_input_tokens_seen": 16489248, "step": 16385 }, { "epoch": 7.727487034417727, "grad_norm": 0.7547784447669983, "learning_rate": 3.851696440142937e-05, "loss": 0.1026, "num_input_tokens_seen": 16494912, "step": 16390 }, { "epoch": 7.72984441301273, "grad_norm": 0.36608967185020447, "learning_rate": 3.850831036952336e-05, "loss": 0.0329, "num_input_tokens_seen": 16499520, "step": 16395 }, { "epoch": 7.732201791607732, "grad_norm": 1.5652415752410889, "learning_rate": 3.8499654050889754e-05, "loss": 0.1571, "num_input_tokens_seen": 16503552, "step": 16400 }, { "epoch": 7.7345591702027345, "grad_norm": 0.27762725949287415, "learning_rate": 3.8490995446993924e-05, "loss": 0.0558, "num_input_tokens_seen": 16508320, "step": 16405 }, { "epoch": 7.736916548797737, "grad_norm": 0.09160267561674118, "learning_rate": 3.848233455930164e-05, "loss": 0.0274, "num_input_tokens_seen": 16513632, "step": 16410 }, { "epoch": 7.739273927392739, "grad_norm": 1.0423741340637207, "learning_rate": 3.847367138927902e-05, "loss": 0.1234, "num_input_tokens_seen": 16519200, "step": 16415 }, { "epoch": 7.741631305987742, "grad_norm": 1.5030196905136108, "learning_rate": 3.846500593839262e-05, "loss": 0.2398, "num_input_tokens_seen": 16524352, "step": 16420 }, { "epoch": 7.743988684582744, "grad_norm": 0.35707464814186096, "learning_rate": 3.8456338208109324e-05, "loss": 0.0428, "num_input_tokens_seen": 16529344, "step": 16425 }, { "epoch": 7.7463460631777465, "grad_norm": 0.14197759330272675, "learning_rate": 3.844766819989645e-05, "loss": 0.0497, "num_input_tokens_seen": 16535392, "step": 16430 }, { "epoch": 7.748703441772749, "grad_norm": 0.5615251064300537, "learning_rate": 3.8438995915221677e-05, "loss": 0.0844, "num_input_tokens_seen": 16540192, "step": 16435 }, { "epoch": 7.751060820367751, "grad_norm": 1.0359493494033813, "learning_rate": 3.843032135555308e-05, "loss": 0.0861, "num_input_tokens_seen": 16546208, "step": 16440 }, { "epoch": 7.753418198962754, "grad_norm": 0.33092737197875977, "learning_rate": 3.8421644522359105e-05, "loss": 0.1858, "num_input_tokens_seen": 16550848, "step": 16445 }, { "epoch": 7.755775577557756, "grad_norm": 0.032714128494262695, "learning_rate": 3.8412965417108606e-05, "loss": 0.0948, "num_input_tokens_seen": 16555104, "step": 16450 }, { "epoch": 7.7581329561527586, "grad_norm": 1.1890654563903809, "learning_rate": 3.8404284041270797e-05, "loss": 0.089, "num_input_tokens_seen": 16559488, "step": 16455 }, { "epoch": 7.760490334747761, "grad_norm": 0.3123040199279785, "learning_rate": 3.839560039631529e-05, "loss": 0.0943, "num_input_tokens_seen": 16564256, "step": 16460 }, { "epoch": 7.7628477133427625, "grad_norm": 0.3318323791027069, "learning_rate": 3.8386914483712085e-05, "loss": 0.1629, "num_input_tokens_seen": 16568832, "step": 16465 }, { "epoch": 7.765205091937765, "grad_norm": 0.8074653148651123, "learning_rate": 3.8378226304931545e-05, "loss": 0.1746, "num_input_tokens_seen": 16573888, "step": 16470 }, { "epoch": 7.767562470532767, "grad_norm": 0.6628521680831909, "learning_rate": 3.836953586144445e-05, "loss": 0.2049, "num_input_tokens_seen": 16578336, "step": 16475 }, { "epoch": 7.76991984912777, "grad_norm": 1.0901528596878052, "learning_rate": 3.836084315472193e-05, "loss": 0.11, "num_input_tokens_seen": 16582400, "step": 16480 }, { "epoch": 7.772277227722772, "grad_norm": 0.6148816347122192, "learning_rate": 3.835214818623553e-05, "loss": 0.1954, "num_input_tokens_seen": 16587712, "step": 16485 }, { "epoch": 7.7746346063177745, "grad_norm": 0.30153918266296387, "learning_rate": 3.8343450957457136e-05, "loss": 0.1615, "num_input_tokens_seen": 16592576, "step": 16490 }, { "epoch": 7.776991984912777, "grad_norm": 1.4284199476242065, "learning_rate": 3.833475146985905e-05, "loss": 0.1854, "num_input_tokens_seen": 16597344, "step": 16495 }, { "epoch": 7.779349363507779, "grad_norm": 0.2071966528892517, "learning_rate": 3.832604972491396e-05, "loss": 0.0392, "num_input_tokens_seen": 16601856, "step": 16500 }, { "epoch": 7.781706742102782, "grad_norm": 1.2864325046539307, "learning_rate": 3.831734572409492e-05, "loss": 0.2827, "num_input_tokens_seen": 16606304, "step": 16505 }, { "epoch": 7.784064120697784, "grad_norm": 1.022572636604309, "learning_rate": 3.830863946887536e-05, "loss": 0.1058, "num_input_tokens_seen": 16611264, "step": 16510 }, { "epoch": 7.7864214992927865, "grad_norm": 0.46079206466674805, "learning_rate": 3.82999309607291e-05, "loss": 0.1349, "num_input_tokens_seen": 16616256, "step": 16515 }, { "epoch": 7.788778877887789, "grad_norm": 0.8854273557662964, "learning_rate": 3.829122020113035e-05, "loss": 0.074, "num_input_tokens_seen": 16621920, "step": 16520 }, { "epoch": 7.791136256482791, "grad_norm": 0.6669622659683228, "learning_rate": 3.82825071915537e-05, "loss": 0.1071, "num_input_tokens_seen": 16628448, "step": 16525 }, { "epoch": 7.793493635077794, "grad_norm": 0.1994415670633316, "learning_rate": 3.8273791933474104e-05, "loss": 0.1032, "num_input_tokens_seen": 16634624, "step": 16530 }, { "epoch": 7.795851013672796, "grad_norm": 0.06697599589824677, "learning_rate": 3.82650744283669e-05, "loss": 0.1841, "num_input_tokens_seen": 16639712, "step": 16535 }, { "epoch": 7.7982083922677985, "grad_norm": 0.8690854907035828, "learning_rate": 3.825635467770783e-05, "loss": 0.167, "num_input_tokens_seen": 16644800, "step": 16540 }, { "epoch": 7.800565770862801, "grad_norm": 0.14522434771060944, "learning_rate": 3.8247632682973e-05, "loss": 0.0382, "num_input_tokens_seen": 16650240, "step": 16545 }, { "epoch": 7.802923149457803, "grad_norm": 0.7492436170578003, "learning_rate": 3.8238908445638886e-05, "loss": 0.2372, "num_input_tokens_seen": 16655552, "step": 16550 }, { "epoch": 7.805280528052805, "grad_norm": 0.05450417473912239, "learning_rate": 3.823018196718235e-05, "loss": 0.2275, "num_input_tokens_seen": 16660096, "step": 16555 }, { "epoch": 7.807637906647807, "grad_norm": 2.0139455795288086, "learning_rate": 3.822145324908064e-05, "loss": 0.0983, "num_input_tokens_seen": 16665056, "step": 16560 }, { "epoch": 7.80999528524281, "grad_norm": 1.5628492832183838, "learning_rate": 3.821272229281139e-05, "loss": 0.1059, "num_input_tokens_seen": 16671072, "step": 16565 }, { "epoch": 7.812352663837812, "grad_norm": 0.3027603328227997, "learning_rate": 3.82039890998526e-05, "loss": 0.1204, "num_input_tokens_seen": 16676672, "step": 16570 }, { "epoch": 7.814710042432814, "grad_norm": 0.6361892223358154, "learning_rate": 3.8195253671682634e-05, "loss": 0.1558, "num_input_tokens_seen": 16680608, "step": 16575 }, { "epoch": 7.817067421027817, "grad_norm": 0.6371406316757202, "learning_rate": 3.818651600978027e-05, "loss": 0.0451, "num_input_tokens_seen": 16685664, "step": 16580 }, { "epoch": 7.819424799622819, "grad_norm": 0.42375367879867554, "learning_rate": 3.817777611562464e-05, "loss": 0.0546, "num_input_tokens_seen": 16690560, "step": 16585 }, { "epoch": 7.821782178217822, "grad_norm": 0.06940038502216339, "learning_rate": 3.8169033990695255e-05, "loss": 0.043, "num_input_tokens_seen": 16695648, "step": 16590 }, { "epoch": 7.824139556812824, "grad_norm": 1.0630768537521362, "learning_rate": 3.816028963647201e-05, "loss": 0.21, "num_input_tokens_seen": 16700672, "step": 16595 }, { "epoch": 7.826496935407826, "grad_norm": 0.39607107639312744, "learning_rate": 3.815154305443519e-05, "loss": 0.0918, "num_input_tokens_seen": 16705792, "step": 16600 }, { "epoch": 7.828854314002829, "grad_norm": 1.834196925163269, "learning_rate": 3.814279424606543e-05, "loss": 0.1612, "num_input_tokens_seen": 16711424, "step": 16605 }, { "epoch": 7.831211692597831, "grad_norm": 0.10640683770179749, "learning_rate": 3.813404321284375e-05, "loss": 0.406, "num_input_tokens_seen": 16715968, "step": 16610 }, { "epoch": 7.833569071192834, "grad_norm": 0.4896719455718994, "learning_rate": 3.812528995625156e-05, "loss": 0.1018, "num_input_tokens_seen": 16720160, "step": 16615 }, { "epoch": 7.835926449787836, "grad_norm": 0.6938389539718628, "learning_rate": 3.8116534477770634e-05, "loss": 0.158, "num_input_tokens_seen": 16726496, "step": 16620 }, { "epoch": 7.838283828382838, "grad_norm": 0.6465644240379333, "learning_rate": 3.8107776778883125e-05, "loss": 0.122, "num_input_tokens_seen": 16732352, "step": 16625 }, { "epoch": 7.840641206977841, "grad_norm": 0.14304353296756744, "learning_rate": 3.809901686107157e-05, "loss": 0.0543, "num_input_tokens_seen": 16738080, "step": 16630 }, { "epoch": 7.842998585572843, "grad_norm": 0.18366822600364685, "learning_rate": 3.8090254725818864e-05, "loss": 0.0638, "num_input_tokens_seen": 16743328, "step": 16635 }, { "epoch": 7.845355964167846, "grad_norm": 1.2608088254928589, "learning_rate": 3.80814903746083e-05, "loss": 0.1474, "num_input_tokens_seen": 16748992, "step": 16640 }, { "epoch": 7.847713342762848, "grad_norm": 0.6729748249053955, "learning_rate": 3.8072723808923514e-05, "loss": 0.0464, "num_input_tokens_seen": 16754112, "step": 16645 }, { "epoch": 7.8500707213578504, "grad_norm": 0.11050213128328323, "learning_rate": 3.806395503024857e-05, "loss": 0.227, "num_input_tokens_seen": 16759616, "step": 16650 }, { "epoch": 7.852428099952853, "grad_norm": 1.2204930782318115, "learning_rate": 3.805518404006784e-05, "loss": 0.2335, "num_input_tokens_seen": 16764064, "step": 16655 }, { "epoch": 7.854785478547855, "grad_norm": 0.7155949473381042, "learning_rate": 3.8046410839866116e-05, "loss": 0.0347, "num_input_tokens_seen": 16769760, "step": 16660 }, { "epoch": 7.857142857142857, "grad_norm": 0.1257636696100235, "learning_rate": 3.803763543112855e-05, "loss": 0.0435, "num_input_tokens_seen": 16773984, "step": 16665 }, { "epoch": 7.859500235737859, "grad_norm": 0.6808568835258484, "learning_rate": 3.8028857815340674e-05, "loss": 0.0673, "num_input_tokens_seen": 16780256, "step": 16670 }, { "epoch": 7.861857614332862, "grad_norm": 0.19893915951251984, "learning_rate": 3.802007799398839e-05, "loss": 0.0736, "num_input_tokens_seen": 16784736, "step": 16675 }, { "epoch": 7.864214992927864, "grad_norm": 1.1377586126327515, "learning_rate": 3.801129596855797e-05, "loss": 0.1521, "num_input_tokens_seen": 16789504, "step": 16680 }, { "epoch": 7.866572371522866, "grad_norm": 0.3925810754299164, "learning_rate": 3.8002511740536055e-05, "loss": 0.1447, "num_input_tokens_seen": 16794208, "step": 16685 }, { "epoch": 7.868929750117869, "grad_norm": 0.2280578315258026, "learning_rate": 3.799372531140968e-05, "loss": 0.2111, "num_input_tokens_seen": 16799680, "step": 16690 }, { "epoch": 7.871287128712871, "grad_norm": 0.22161173820495605, "learning_rate": 3.7984936682666225e-05, "loss": 0.0613, "num_input_tokens_seen": 16804320, "step": 16695 }, { "epoch": 7.873644507307874, "grad_norm": 0.10852399468421936, "learning_rate": 3.7976145855793464e-05, "loss": 0.2264, "num_input_tokens_seen": 16810240, "step": 16700 }, { "epoch": 7.876001885902876, "grad_norm": 0.2748870551586151, "learning_rate": 3.796735283227954e-05, "loss": 0.064, "num_input_tokens_seen": 16814880, "step": 16705 }, { "epoch": 7.878359264497878, "grad_norm": 1.3799415826797485, "learning_rate": 3.7958557613612936e-05, "loss": 0.1444, "num_input_tokens_seen": 16819328, "step": 16710 }, { "epoch": 7.880716643092881, "grad_norm": 0.05682607367634773, "learning_rate": 3.794976020128257e-05, "loss": 0.1309, "num_input_tokens_seen": 16824064, "step": 16715 }, { "epoch": 7.883074021687883, "grad_norm": 0.12508511543273926, "learning_rate": 3.7940960596777665e-05, "loss": 0.0423, "num_input_tokens_seen": 16828832, "step": 16720 }, { "epoch": 7.885431400282886, "grad_norm": 0.29265260696411133, "learning_rate": 3.7932158801587855e-05, "loss": 0.076, "num_input_tokens_seen": 16833216, "step": 16725 }, { "epoch": 7.887788778877888, "grad_norm": 0.14882972836494446, "learning_rate": 3.792335481720314e-05, "loss": 0.0881, "num_input_tokens_seen": 16837856, "step": 16730 }, { "epoch": 7.89014615747289, "grad_norm": 0.6435796022415161, "learning_rate": 3.791454864511389e-05, "loss": 0.3584, "num_input_tokens_seen": 16843872, "step": 16735 }, { "epoch": 7.892503536067893, "grad_norm": 0.5907211899757385, "learning_rate": 3.790574028681082e-05, "loss": 0.1589, "num_input_tokens_seen": 16849728, "step": 16740 }, { "epoch": 7.894860914662895, "grad_norm": 0.5226572751998901, "learning_rate": 3.789692974378505e-05, "loss": 0.1429, "num_input_tokens_seen": 16854592, "step": 16745 }, { "epoch": 7.897218293257898, "grad_norm": 0.1890159696340561, "learning_rate": 3.7888117017528046e-05, "loss": 0.0876, "num_input_tokens_seen": 16859808, "step": 16750 }, { "epoch": 7.899575671852899, "grad_norm": 0.9989511966705322, "learning_rate": 3.787930210953167e-05, "loss": 0.1622, "num_input_tokens_seen": 16864192, "step": 16755 }, { "epoch": 7.9019330504479015, "grad_norm": 1.1494252681732178, "learning_rate": 3.787048502128811e-05, "loss": 0.2264, "num_input_tokens_seen": 16869056, "step": 16760 }, { "epoch": 7.904290429042904, "grad_norm": 0.2487543672323227, "learning_rate": 3.786166575428997e-05, "loss": 0.1088, "num_input_tokens_seen": 16872672, "step": 16765 }, { "epoch": 7.906647807637906, "grad_norm": 0.2722160220146179, "learning_rate": 3.78528443100302e-05, "loss": 0.0888, "num_input_tokens_seen": 16877888, "step": 16770 }, { "epoch": 7.909005186232909, "grad_norm": 0.46276652812957764, "learning_rate": 3.784402069000211e-05, "loss": 0.0583, "num_input_tokens_seen": 16881984, "step": 16775 }, { "epoch": 7.911362564827911, "grad_norm": 1.3369053602218628, "learning_rate": 3.783519489569941e-05, "loss": 0.1668, "num_input_tokens_seen": 16886624, "step": 16780 }, { "epoch": 7.9137199434229135, "grad_norm": 0.2590189576148987, "learning_rate": 3.782636692861613e-05, "loss": 0.1722, "num_input_tokens_seen": 16892096, "step": 16785 }, { "epoch": 7.916077322017916, "grad_norm": 0.013691609725356102, "learning_rate": 3.781753679024672e-05, "loss": 0.0855, "num_input_tokens_seen": 16896960, "step": 16790 }, { "epoch": 7.918434700612918, "grad_norm": 0.5294360518455505, "learning_rate": 3.7808704482085956e-05, "loss": 0.0396, "num_input_tokens_seen": 16902272, "step": 16795 }, { "epoch": 7.920792079207921, "grad_norm": 1.6285513639450073, "learning_rate": 3.7799870005629e-05, "loss": 0.1813, "num_input_tokens_seen": 16907136, "step": 16800 }, { "epoch": 7.923149457802923, "grad_norm": 0.40168634057044983, "learning_rate": 3.779103336237138e-05, "loss": 0.1426, "num_input_tokens_seen": 16911680, "step": 16805 }, { "epoch": 7.9255068363979255, "grad_norm": 0.02709430269896984, "learning_rate": 3.7782194553808995e-05, "loss": 0.1607, "num_input_tokens_seen": 16916640, "step": 16810 }, { "epoch": 7.927864214992928, "grad_norm": 0.6728273630142212, "learning_rate": 3.77733535814381e-05, "loss": 0.0711, "num_input_tokens_seen": 16920672, "step": 16815 }, { "epoch": 7.93022159358793, "grad_norm": 1.3011656999588013, "learning_rate": 3.776451044675532e-05, "loss": 0.0887, "num_input_tokens_seen": 16925472, "step": 16820 }, { "epoch": 7.932578972182933, "grad_norm": 0.309010773897171, "learning_rate": 3.7755665151257654e-05, "loss": 0.0637, "num_input_tokens_seen": 16930240, "step": 16825 }, { "epoch": 7.934936350777935, "grad_norm": 2.256226062774658, "learning_rate": 3.774681769644245e-05, "loss": 0.1304, "num_input_tokens_seen": 16935200, "step": 16830 }, { "epoch": 7.9372937293729375, "grad_norm": 0.11703174561262131, "learning_rate": 3.773796808380745e-05, "loss": 0.0914, "num_input_tokens_seen": 16941248, "step": 16835 }, { "epoch": 7.93965110796794, "grad_norm": 0.22517581284046173, "learning_rate": 3.7729116314850733e-05, "loss": 0.1464, "num_input_tokens_seen": 16946976, "step": 16840 }, { "epoch": 7.942008486562942, "grad_norm": 1.792596697807312, "learning_rate": 3.7720262391070736e-05, "loss": 0.0923, "num_input_tokens_seen": 16951360, "step": 16845 }, { "epoch": 7.944365865157945, "grad_norm": 0.15096670389175415, "learning_rate": 3.7711406313966303e-05, "loss": 0.0977, "num_input_tokens_seen": 16956512, "step": 16850 }, { "epoch": 7.946723243752947, "grad_norm": 0.05323576554656029, "learning_rate": 3.770254808503661e-05, "loss": 0.1233, "num_input_tokens_seen": 16961536, "step": 16855 }, { "epoch": 7.9490806223479495, "grad_norm": 1.511902928352356, "learning_rate": 3.7693687705781203e-05, "loss": 0.1786, "num_input_tokens_seen": 16967360, "step": 16860 }, { "epoch": 7.951438000942951, "grad_norm": 1.4917387962341309, "learning_rate": 3.7684825177699995e-05, "loss": 0.1263, "num_input_tokens_seen": 16972704, "step": 16865 }, { "epoch": 7.9537953795379535, "grad_norm": 0.21098318696022034, "learning_rate": 3.7675960502293255e-05, "loss": 0.1422, "num_input_tokens_seen": 16977760, "step": 16870 }, { "epoch": 7.956152758132956, "grad_norm": 0.08831902593374252, "learning_rate": 3.766709368106163e-05, "loss": 0.0544, "num_input_tokens_seen": 16983552, "step": 16875 }, { "epoch": 7.958510136727958, "grad_norm": 1.4436051845550537, "learning_rate": 3.765822471550612e-05, "loss": 0.2577, "num_input_tokens_seen": 16988480, "step": 16880 }, { "epoch": 7.960867515322961, "grad_norm": 2.191728353500366, "learning_rate": 3.76493536071281e-05, "loss": 0.1363, "num_input_tokens_seen": 16992928, "step": 16885 }, { "epoch": 7.963224893917963, "grad_norm": 0.5397895574569702, "learning_rate": 3.7640480357429274e-05, "loss": 0.1031, "num_input_tokens_seen": 16997472, "step": 16890 }, { "epoch": 7.9655822725129655, "grad_norm": 0.2846635580062866, "learning_rate": 3.763160496791175e-05, "loss": 0.0754, "num_input_tokens_seen": 17002528, "step": 16895 }, { "epoch": 7.967939651107968, "grad_norm": 0.45750677585601807, "learning_rate": 3.762272744007798e-05, "loss": 0.2309, "num_input_tokens_seen": 17007648, "step": 16900 }, { "epoch": 7.97029702970297, "grad_norm": 0.34796077013015747, "learning_rate": 3.7613847775430776e-05, "loss": 0.2064, "num_input_tokens_seen": 17013440, "step": 16905 }, { "epoch": 7.972654408297973, "grad_norm": 0.22302255034446716, "learning_rate": 3.760496597547332e-05, "loss": 0.0785, "num_input_tokens_seen": 17018048, "step": 16910 }, { "epoch": 7.975011786892975, "grad_norm": 0.4126151502132416, "learning_rate": 3.759608204170914e-05, "loss": 0.0543, "num_input_tokens_seen": 17024960, "step": 16915 }, { "epoch": 7.9773691654879775, "grad_norm": 0.1858762800693512, "learning_rate": 3.758719597564213e-05, "loss": 0.0217, "num_input_tokens_seen": 17031232, "step": 16920 }, { "epoch": 7.97972654408298, "grad_norm": 0.28893211483955383, "learning_rate": 3.7578307778776564e-05, "loss": 0.047, "num_input_tokens_seen": 17036512, "step": 16925 }, { "epoch": 7.982083922677982, "grad_norm": 1.1544429063796997, "learning_rate": 3.756941745261707e-05, "loss": 0.06, "num_input_tokens_seen": 17042112, "step": 16930 }, { "epoch": 7.984441301272985, "grad_norm": 2.2481863498687744, "learning_rate": 3.75605249986686e-05, "loss": 0.2452, "num_input_tokens_seen": 17048000, "step": 16935 }, { "epoch": 7.986798679867987, "grad_norm": 1.4341151714324951, "learning_rate": 3.755163041843652e-05, "loss": 0.1119, "num_input_tokens_seen": 17053280, "step": 16940 }, { "epoch": 7.9891560584629895, "grad_norm": 0.09349039196968079, "learning_rate": 3.7542733713426536e-05, "loss": 0.1418, "num_input_tokens_seen": 17058784, "step": 16945 }, { "epoch": 7.991513437057992, "grad_norm": 0.08709695190191269, "learning_rate": 3.753383488514469e-05, "loss": 0.2862, "num_input_tokens_seen": 17063136, "step": 16950 }, { "epoch": 7.993870815652993, "grad_norm": 1.6691299676895142, "learning_rate": 3.7524933935097405e-05, "loss": 0.1414, "num_input_tokens_seen": 17069600, "step": 16955 }, { "epoch": 7.996228194247996, "grad_norm": 0.25309252738952637, "learning_rate": 3.7516030864791474e-05, "loss": 0.0459, "num_input_tokens_seen": 17074400, "step": 16960 }, { "epoch": 7.998585572842998, "grad_norm": 1.5468151569366455, "learning_rate": 3.7507125675734026e-05, "loss": 0.0805, "num_input_tokens_seen": 17079296, "step": 16965 }, { "epoch": 8.0, "eval_loss": 0.15088850259780884, "eval_runtime": 15.0747, "eval_samples_per_second": 62.555, "eval_steps_per_second": 15.655, "num_input_tokens_seen": 17082336, "step": 16968 }, { "epoch": 8.000942951438, "grad_norm": 2.6355881690979004, "learning_rate": 3.749821836943256e-05, "loss": 0.4733, "num_input_tokens_seen": 17085120, "step": 16970 }, { "epoch": 8.003300330033003, "grad_norm": 0.27055710554122925, "learning_rate": 3.748930894739493e-05, "loss": 0.0358, "num_input_tokens_seen": 17089824, "step": 16975 }, { "epoch": 8.005657708628005, "grad_norm": 0.35269784927368164, "learning_rate": 3.748039741112935e-05, "loss": 0.0941, "num_input_tokens_seen": 17094624, "step": 16980 }, { "epoch": 8.008015087223008, "grad_norm": 0.019250186160206795, "learning_rate": 3.74714837621444e-05, "loss": 0.0555, "num_input_tokens_seen": 17099520, "step": 16985 }, { "epoch": 8.01037246581801, "grad_norm": 0.0821886956691742, "learning_rate": 3.7462568001949e-05, "loss": 0.0504, "num_input_tokens_seen": 17104416, "step": 16990 }, { "epoch": 8.012729844413013, "grad_norm": 0.7739474773406982, "learning_rate": 3.745365013205243e-05, "loss": 0.0798, "num_input_tokens_seen": 17109440, "step": 16995 }, { "epoch": 8.015087223008015, "grad_norm": 0.16844429075717926, "learning_rate": 3.744473015396436e-05, "loss": 0.0564, "num_input_tokens_seen": 17115104, "step": 17000 }, { "epoch": 8.017444601603017, "grad_norm": 0.18539975583553314, "learning_rate": 3.743580806919476e-05, "loss": 0.1551, "num_input_tokens_seen": 17120000, "step": 17005 }, { "epoch": 8.01980198019802, "grad_norm": 0.8923903107643127, "learning_rate": 3.742688387925402e-05, "loss": 0.3367, "num_input_tokens_seen": 17124864, "step": 17010 }, { "epoch": 8.022159358793022, "grad_norm": 0.27866125106811523, "learning_rate": 3.741795758565281e-05, "loss": 0.096, "num_input_tokens_seen": 17129760, "step": 17015 }, { "epoch": 8.024516737388025, "grad_norm": 1.2032971382141113, "learning_rate": 3.7409029189902245e-05, "loss": 0.106, "num_input_tokens_seen": 17134752, "step": 17020 }, { "epoch": 8.026874115983027, "grad_norm": 0.6446435451507568, "learning_rate": 3.740009869351372e-05, "loss": 0.1633, "num_input_tokens_seen": 17139872, "step": 17025 }, { "epoch": 8.02923149457803, "grad_norm": 0.42680737376213074, "learning_rate": 3.7391166097999045e-05, "loss": 0.078, "num_input_tokens_seen": 17145344, "step": 17030 }, { "epoch": 8.031588873173032, "grad_norm": 0.8762369751930237, "learning_rate": 3.7382231404870325e-05, "loss": 0.106, "num_input_tokens_seen": 17150336, "step": 17035 }, { "epoch": 8.033946251768034, "grad_norm": 0.9562256336212158, "learning_rate": 3.737329461564007e-05, "loss": 0.1836, "num_input_tokens_seen": 17155328, "step": 17040 }, { "epoch": 8.036303630363037, "grad_norm": 0.28735071420669556, "learning_rate": 3.736435573182111e-05, "loss": 0.0936, "num_input_tokens_seen": 17160128, "step": 17045 }, { "epoch": 8.038661008958039, "grad_norm": 1.6990081071853638, "learning_rate": 3.735541475492668e-05, "loss": 0.101, "num_input_tokens_seen": 17165728, "step": 17050 }, { "epoch": 8.041018387553041, "grad_norm": 1.2643311023712158, "learning_rate": 3.734647168647031e-05, "loss": 0.0579, "num_input_tokens_seen": 17169952, "step": 17055 }, { "epoch": 8.043375766148044, "grad_norm": 0.7371531128883362, "learning_rate": 3.7337526527965906e-05, "loss": 0.1636, "num_input_tokens_seen": 17175008, "step": 17060 }, { "epoch": 8.045733144743046, "grad_norm": 0.1552770584821701, "learning_rate": 3.732857928092774e-05, "loss": 0.1047, "num_input_tokens_seen": 17179552, "step": 17065 }, { "epoch": 8.048090523338049, "grad_norm": 0.07845120877027512, "learning_rate": 3.7319629946870444e-05, "loss": 0.1286, "num_input_tokens_seen": 17184608, "step": 17070 }, { "epoch": 8.050447901933051, "grad_norm": 0.8396021127700806, "learning_rate": 3.7310678527308965e-05, "loss": 0.1314, "num_input_tokens_seen": 17190080, "step": 17075 }, { "epoch": 8.052805280528053, "grad_norm": 1.824076771736145, "learning_rate": 3.7301725023758635e-05, "loss": 0.0885, "num_input_tokens_seen": 17195136, "step": 17080 }, { "epoch": 8.055162659123056, "grad_norm": 1.6183151006698608, "learning_rate": 3.7292769437735126e-05, "loss": 0.1736, "num_input_tokens_seen": 17201344, "step": 17085 }, { "epoch": 8.057520037718058, "grad_norm": 3.2062885761260986, "learning_rate": 3.728381177075447e-05, "loss": 0.2111, "num_input_tokens_seen": 17206016, "step": 17090 }, { "epoch": 8.05987741631306, "grad_norm": 0.06869488209486008, "learning_rate": 3.7274852024333054e-05, "loss": 0.0955, "num_input_tokens_seen": 17211040, "step": 17095 }, { "epoch": 8.062234794908063, "grad_norm": 0.8332776427268982, "learning_rate": 3.726589019998761e-05, "loss": 0.1463, "num_input_tokens_seen": 17216288, "step": 17100 }, { "epoch": 8.064592173503065, "grad_norm": 0.3012668490409851, "learning_rate": 3.725692629923521e-05, "loss": 0.2897, "num_input_tokens_seen": 17222880, "step": 17105 }, { "epoch": 8.066949552098066, "grad_norm": 0.16656805574893951, "learning_rate": 3.72479603235933e-05, "loss": 0.2187, "num_input_tokens_seen": 17227392, "step": 17110 }, { "epoch": 8.069306930693068, "grad_norm": 0.4363645017147064, "learning_rate": 3.723899227457967e-05, "loss": 0.0373, "num_input_tokens_seen": 17232512, "step": 17115 }, { "epoch": 8.07166430928807, "grad_norm": 0.5266637802124023, "learning_rate": 3.7230022153712446e-05, "loss": 0.2, "num_input_tokens_seen": 17237568, "step": 17120 }, { "epoch": 8.074021687883073, "grad_norm": 3.043128490447998, "learning_rate": 3.722104996251013e-05, "loss": 0.4364, "num_input_tokens_seen": 17243808, "step": 17125 }, { "epoch": 8.076379066478076, "grad_norm": 0.9876762628555298, "learning_rate": 3.721207570249155e-05, "loss": 0.2482, "num_input_tokens_seen": 17248256, "step": 17130 }, { "epoch": 8.078736445073078, "grad_norm": 0.13685476779937744, "learning_rate": 3.720309937517592e-05, "loss": 0.2209, "num_input_tokens_seen": 17252832, "step": 17135 }, { "epoch": 8.08109382366808, "grad_norm": 2.0306546688079834, "learning_rate": 3.719412098208275e-05, "loss": 0.138, "num_input_tokens_seen": 17258560, "step": 17140 }, { "epoch": 8.083451202263083, "grad_norm": 0.6137235164642334, "learning_rate": 3.718514052473194e-05, "loss": 0.0388, "num_input_tokens_seen": 17262272, "step": 17145 }, { "epoch": 8.085808580858085, "grad_norm": 0.12533175945281982, "learning_rate": 3.717615800464373e-05, "loss": 0.2718, "num_input_tokens_seen": 17267360, "step": 17150 }, { "epoch": 8.088165959453088, "grad_norm": 1.0207983255386353, "learning_rate": 3.716717342333872e-05, "loss": 0.1106, "num_input_tokens_seen": 17271360, "step": 17155 }, { "epoch": 8.09052333804809, "grad_norm": 0.5016215443611145, "learning_rate": 3.7158186782337825e-05, "loss": 0.0634, "num_input_tokens_seen": 17276640, "step": 17160 }, { "epoch": 8.092880716643092, "grad_norm": 0.7713665962219238, "learning_rate": 3.714919808316235e-05, "loss": 0.1897, "num_input_tokens_seen": 17281216, "step": 17165 }, { "epoch": 8.095238095238095, "grad_norm": 0.5570078492164612, "learning_rate": 3.714020732733392e-05, "loss": 0.1158, "num_input_tokens_seen": 17286624, "step": 17170 }, { "epoch": 8.097595473833097, "grad_norm": 1.3830621242523193, "learning_rate": 3.713121451637451e-05, "loss": 0.216, "num_input_tokens_seen": 17291744, "step": 17175 }, { "epoch": 8.0999528524281, "grad_norm": 1.5544909238815308, "learning_rate": 3.712221965180647e-05, "loss": 0.2238, "num_input_tokens_seen": 17297280, "step": 17180 }, { "epoch": 8.102310231023102, "grad_norm": 1.2526692152023315, "learning_rate": 3.711322273515247e-05, "loss": 0.1584, "num_input_tokens_seen": 17301760, "step": 17185 }, { "epoch": 8.104667609618105, "grad_norm": 0.6666073203086853, "learning_rate": 3.710422376793551e-05, "loss": 0.2124, "num_input_tokens_seen": 17306976, "step": 17190 }, { "epoch": 8.107024988213107, "grad_norm": 0.1596747636795044, "learning_rate": 3.7095222751679e-05, "loss": 0.0353, "num_input_tokens_seen": 17311680, "step": 17195 }, { "epoch": 8.10938236680811, "grad_norm": 0.7310019731521606, "learning_rate": 3.7086219687906644e-05, "loss": 0.1342, "num_input_tokens_seen": 17316704, "step": 17200 }, { "epoch": 8.111739745403112, "grad_norm": 0.09527519345283508, "learning_rate": 3.7077214578142504e-05, "loss": 0.0929, "num_input_tokens_seen": 17321408, "step": 17205 }, { "epoch": 8.114097123998114, "grad_norm": 0.304465115070343, "learning_rate": 3.706820742391099e-05, "loss": 0.1065, "num_input_tokens_seen": 17326464, "step": 17210 }, { "epoch": 8.116454502593117, "grad_norm": 2.040691614151001, "learning_rate": 3.7059198226736876e-05, "loss": 0.1835, "num_input_tokens_seen": 17332736, "step": 17215 }, { "epoch": 8.118811881188119, "grad_norm": 0.9274699091911316, "learning_rate": 3.705018698814525e-05, "loss": 0.1729, "num_input_tokens_seen": 17337216, "step": 17220 }, { "epoch": 8.121169259783121, "grad_norm": 0.9407032132148743, "learning_rate": 3.704117370966158e-05, "loss": 0.2506, "num_input_tokens_seen": 17341664, "step": 17225 }, { "epoch": 8.123526638378124, "grad_norm": 0.3789234459400177, "learning_rate": 3.703215839281163e-05, "loss": 0.057, "num_input_tokens_seen": 17346048, "step": 17230 }, { "epoch": 8.125884016973126, "grad_norm": 0.0752778872847557, "learning_rate": 3.7023141039121565e-05, "loss": 0.0728, "num_input_tokens_seen": 17350624, "step": 17235 }, { "epoch": 8.128241395568129, "grad_norm": 1.2529479265213013, "learning_rate": 3.7014121650117864e-05, "loss": 0.112, "num_input_tokens_seen": 17357088, "step": 17240 }, { "epoch": 8.130598774163131, "grad_norm": 0.03715832531452179, "learning_rate": 3.700510022732736e-05, "loss": 0.014, "num_input_tokens_seen": 17361696, "step": 17245 }, { "epoch": 8.132956152758133, "grad_norm": 0.20760215818881989, "learning_rate": 3.699607677227722e-05, "loss": 0.0361, "num_input_tokens_seen": 17368320, "step": 17250 }, { "epoch": 8.135313531353136, "grad_norm": 0.4593810737133026, "learning_rate": 3.698705128649496e-05, "loss": 0.1406, "num_input_tokens_seen": 17374624, "step": 17255 }, { "epoch": 8.137670909948138, "grad_norm": 1.6970528364181519, "learning_rate": 3.697802377150845e-05, "loss": 0.2459, "num_input_tokens_seen": 17379360, "step": 17260 }, { "epoch": 8.14002828854314, "grad_norm": 2.5185904502868652, "learning_rate": 3.696899422884589e-05, "loss": 0.2148, "num_input_tokens_seen": 17384128, "step": 17265 }, { "epoch": 8.142385667138143, "grad_norm": 0.2959481477737427, "learning_rate": 3.695996266003582e-05, "loss": 0.0895, "num_input_tokens_seen": 17388576, "step": 17270 }, { "epoch": 8.144743045733145, "grad_norm": 0.4287429749965668, "learning_rate": 3.6950929066607146e-05, "loss": 0.1656, "num_input_tokens_seen": 17394432, "step": 17275 }, { "epoch": 8.147100424328148, "grad_norm": 0.015013097785413265, "learning_rate": 3.6941893450089104e-05, "loss": 0.1328, "num_input_tokens_seen": 17401024, "step": 17280 }, { "epoch": 8.14945780292315, "grad_norm": 1.929368019104004, "learning_rate": 3.6932855812011256e-05, "loss": 0.1061, "num_input_tokens_seen": 17406112, "step": 17285 }, { "epoch": 8.151815181518153, "grad_norm": 0.7566422820091248, "learning_rate": 3.6923816153903526e-05, "loss": 0.0489, "num_input_tokens_seen": 17411616, "step": 17290 }, { "epoch": 8.154172560113155, "grad_norm": 1.0569554567337036, "learning_rate": 3.691477447729618e-05, "loss": 0.093, "num_input_tokens_seen": 17416832, "step": 17295 }, { "epoch": 8.156529938708157, "grad_norm": 1.0839487314224243, "learning_rate": 3.690573078371981e-05, "loss": 0.0543, "num_input_tokens_seen": 17422080, "step": 17300 }, { "epoch": 8.15888731730316, "grad_norm": 1.8076571226119995, "learning_rate": 3.689668507470538e-05, "loss": 0.176, "num_input_tokens_seen": 17426912, "step": 17305 }, { "epoch": 8.16124469589816, "grad_norm": 0.9398935437202454, "learning_rate": 3.688763735178415e-05, "loss": 0.1872, "num_input_tokens_seen": 17433152, "step": 17310 }, { "epoch": 8.163602074493163, "grad_norm": 1.8288556337356567, "learning_rate": 3.687858761648776e-05, "loss": 0.184, "num_input_tokens_seen": 17438208, "step": 17315 }, { "epoch": 8.165959453088165, "grad_norm": 0.20774778723716736, "learning_rate": 3.686953587034817e-05, "loss": 0.076, "num_input_tokens_seen": 17443328, "step": 17320 }, { "epoch": 8.168316831683168, "grad_norm": 0.14473438262939453, "learning_rate": 3.6860482114897696e-05, "loss": 0.1784, "num_input_tokens_seen": 17447840, "step": 17325 }, { "epoch": 8.17067421027817, "grad_norm": 2.2745769023895264, "learning_rate": 3.685142635166897e-05, "loss": 0.1642, "num_input_tokens_seen": 17452544, "step": 17330 }, { "epoch": 8.173031588873172, "grad_norm": 1.1463431119918823, "learning_rate": 3.6842368582195e-05, "loss": 0.2077, "num_input_tokens_seen": 17457824, "step": 17335 }, { "epoch": 8.175388967468175, "grad_norm": 0.3970834016799927, "learning_rate": 3.68333088080091e-05, "loss": 0.0303, "num_input_tokens_seen": 17462496, "step": 17340 }, { "epoch": 8.177746346063177, "grad_norm": 1.5504707098007202, "learning_rate": 3.6824247030644935e-05, "loss": 0.1419, "num_input_tokens_seen": 17467200, "step": 17345 }, { "epoch": 8.18010372465818, "grad_norm": 1.2255390882492065, "learning_rate": 3.681518325163651e-05, "loss": 0.0752, "num_input_tokens_seen": 17471296, "step": 17350 }, { "epoch": 8.182461103253182, "grad_norm": 0.06348011642694473, "learning_rate": 3.680611747251816e-05, "loss": 0.0809, "num_input_tokens_seen": 17475904, "step": 17355 }, { "epoch": 8.184818481848184, "grad_norm": 0.5584111213684082, "learning_rate": 3.67970496948246e-05, "loss": 0.2744, "num_input_tokens_seen": 17480192, "step": 17360 }, { "epoch": 8.187175860443187, "grad_norm": 0.053171806037425995, "learning_rate": 3.6787979920090816e-05, "loss": 0.0572, "num_input_tokens_seen": 17484288, "step": 17365 }, { "epoch": 8.18953323903819, "grad_norm": 2.1754531860351562, "learning_rate": 3.677890814985218e-05, "loss": 0.2616, "num_input_tokens_seen": 17488608, "step": 17370 }, { "epoch": 8.191890617633192, "grad_norm": 0.12180937826633453, "learning_rate": 3.6769834385644385e-05, "loss": 0.2157, "num_input_tokens_seen": 17493120, "step": 17375 }, { "epoch": 8.194247996228194, "grad_norm": 1.0402820110321045, "learning_rate": 3.676075862900348e-05, "loss": 0.0877, "num_input_tokens_seen": 17497376, "step": 17380 }, { "epoch": 8.196605374823196, "grad_norm": 0.2729514539241791, "learning_rate": 3.675168088146582e-05, "loss": 0.1672, "num_input_tokens_seen": 17503008, "step": 17385 }, { "epoch": 8.198962753418199, "grad_norm": 0.09259594231843948, "learning_rate": 3.674260114456812e-05, "loss": 0.0155, "num_input_tokens_seen": 17508992, "step": 17390 }, { "epoch": 8.201320132013201, "grad_norm": 1.0691006183624268, "learning_rate": 3.673351941984742e-05, "loss": 0.1648, "num_input_tokens_seen": 17514880, "step": 17395 }, { "epoch": 8.203677510608204, "grad_norm": 2.0956928730010986, "learning_rate": 3.6724435708841114e-05, "loss": 0.0887, "num_input_tokens_seen": 17519936, "step": 17400 }, { "epoch": 8.206034889203206, "grad_norm": 1.2293107509613037, "learning_rate": 3.671535001308691e-05, "loss": 0.1173, "num_input_tokens_seen": 17525408, "step": 17405 }, { "epoch": 8.208392267798208, "grad_norm": 1.0502172708511353, "learning_rate": 3.670626233412286e-05, "loss": 0.1605, "num_input_tokens_seen": 17531744, "step": 17410 }, { "epoch": 8.21074964639321, "grad_norm": 0.40364933013916016, "learning_rate": 3.669717267348736e-05, "loss": 0.0644, "num_input_tokens_seen": 17535936, "step": 17415 }, { "epoch": 8.213107024988213, "grad_norm": 0.007652191445231438, "learning_rate": 3.6688081032719134e-05, "loss": 0.0345, "num_input_tokens_seen": 17540032, "step": 17420 }, { "epoch": 8.215464403583216, "grad_norm": 0.5722139477729797, "learning_rate": 3.667898741335724e-05, "loss": 0.0444, "num_input_tokens_seen": 17544672, "step": 17425 }, { "epoch": 8.217821782178218, "grad_norm": 0.3353331685066223, "learning_rate": 3.666989181694107e-05, "loss": 0.0954, "num_input_tokens_seen": 17549920, "step": 17430 }, { "epoch": 8.22017916077322, "grad_norm": 0.2124062329530716, "learning_rate": 3.666079424501037e-05, "loss": 0.1944, "num_input_tokens_seen": 17554560, "step": 17435 }, { "epoch": 8.222536539368223, "grad_norm": 0.02242831513285637, "learning_rate": 3.665169469910518e-05, "loss": 0.065, "num_input_tokens_seen": 17559072, "step": 17440 }, { "epoch": 8.224893917963225, "grad_norm": 0.418524831533432, "learning_rate": 3.664259318076592e-05, "loss": 0.1394, "num_input_tokens_seen": 17563456, "step": 17445 }, { "epoch": 8.227251296558228, "grad_norm": 3.911456346511841, "learning_rate": 3.6633489691533315e-05, "loss": 0.2816, "num_input_tokens_seen": 17567456, "step": 17450 }, { "epoch": 8.22960867515323, "grad_norm": 2.1753578186035156, "learning_rate": 3.662438423294843e-05, "loss": 0.4686, "num_input_tokens_seen": 17572864, "step": 17455 }, { "epoch": 8.231966053748232, "grad_norm": 0.4951906204223633, "learning_rate": 3.661527680655267e-05, "loss": 0.091, "num_input_tokens_seen": 17578528, "step": 17460 }, { "epoch": 8.234323432343235, "grad_norm": 0.9240542650222778, "learning_rate": 3.6606167413887754e-05, "loss": 0.0882, "num_input_tokens_seen": 17583744, "step": 17465 }, { "epoch": 8.236680810938237, "grad_norm": 0.27539175748825073, "learning_rate": 3.659705605649576e-05, "loss": 0.0692, "num_input_tokens_seen": 17588416, "step": 17470 }, { "epoch": 8.23903818953324, "grad_norm": 2.2410435676574707, "learning_rate": 3.658794273591908e-05, "loss": 0.1346, "num_input_tokens_seen": 17594336, "step": 17475 }, { "epoch": 8.241395568128242, "grad_norm": 0.27871713042259216, "learning_rate": 3.657882745370045e-05, "loss": 0.1136, "num_input_tokens_seen": 17598880, "step": 17480 }, { "epoch": 8.243752946723244, "grad_norm": 0.761456310749054, "learning_rate": 3.656971021138293e-05, "loss": 0.1409, "num_input_tokens_seen": 17603488, "step": 17485 }, { "epoch": 8.246110325318247, "grad_norm": 0.016981158405542374, "learning_rate": 3.656059101050991e-05, "loss": 0.0388, "num_input_tokens_seen": 17609536, "step": 17490 }, { "epoch": 8.24846770391325, "grad_norm": 0.8131532669067383, "learning_rate": 3.655146985262512e-05, "loss": 0.2091, "num_input_tokens_seen": 17613952, "step": 17495 }, { "epoch": 8.250825082508252, "grad_norm": 0.3086029589176178, "learning_rate": 3.654234673927261e-05, "loss": 0.1117, "num_input_tokens_seen": 17618144, "step": 17500 }, { "epoch": 8.253182461103254, "grad_norm": 1.9316908121109009, "learning_rate": 3.6533221671996796e-05, "loss": 0.0808, "num_input_tokens_seen": 17622880, "step": 17505 }, { "epoch": 8.255539839698255, "grad_norm": 1.654840350151062, "learning_rate": 3.652409465234236e-05, "loss": 0.2825, "num_input_tokens_seen": 17627200, "step": 17510 }, { "epoch": 8.257897218293257, "grad_norm": 0.2382909506559372, "learning_rate": 3.651496568185437e-05, "loss": 0.1057, "num_input_tokens_seen": 17636096, "step": 17515 }, { "epoch": 8.26025459688826, "grad_norm": 0.4030342698097229, "learning_rate": 3.65058347620782e-05, "loss": 0.2172, "num_input_tokens_seen": 17641792, "step": 17520 }, { "epoch": 8.262611975483262, "grad_norm": 0.1464708298444748, "learning_rate": 3.649670189455957e-05, "loss": 0.0888, "num_input_tokens_seen": 17645920, "step": 17525 }, { "epoch": 8.264969354078264, "grad_norm": 0.024561038240790367, "learning_rate": 3.6487567080844507e-05, "loss": 0.0931, "num_input_tokens_seen": 17651968, "step": 17530 }, { "epoch": 8.267326732673267, "grad_norm": 0.44154247641563416, "learning_rate": 3.6478430322479383e-05, "loss": 0.0991, "num_input_tokens_seen": 17657408, "step": 17535 }, { "epoch": 8.269684111268269, "grad_norm": 0.9847261905670166, "learning_rate": 3.64692916210109e-05, "loss": 0.0786, "num_input_tokens_seen": 17662752, "step": 17540 }, { "epoch": 8.272041489863271, "grad_norm": 1.2604047060012817, "learning_rate": 3.646015097798609e-05, "loss": 0.1244, "num_input_tokens_seen": 17667648, "step": 17545 }, { "epoch": 8.274398868458274, "grad_norm": 0.2007923126220703, "learning_rate": 3.6451008394952296e-05, "loss": 0.0589, "num_input_tokens_seen": 17672096, "step": 17550 }, { "epoch": 8.276756247053276, "grad_norm": 1.0173743963241577, "learning_rate": 3.6441863873457204e-05, "loss": 0.0551, "num_input_tokens_seen": 17678400, "step": 17555 }, { "epoch": 8.279113625648279, "grad_norm": 1.4316741228103638, "learning_rate": 3.643271741504884e-05, "loss": 0.1498, "num_input_tokens_seen": 17682752, "step": 17560 }, { "epoch": 8.281471004243281, "grad_norm": 1.5096200704574585, "learning_rate": 3.6423569021275526e-05, "loss": 0.1208, "num_input_tokens_seen": 17686720, "step": 17565 }, { "epoch": 8.283828382838283, "grad_norm": 0.12598179280757904, "learning_rate": 3.641441869368595e-05, "loss": 0.133, "num_input_tokens_seen": 17691904, "step": 17570 }, { "epoch": 8.286185761433286, "grad_norm": 0.03301684930920601, "learning_rate": 3.6405266433829075e-05, "loss": 0.0575, "num_input_tokens_seen": 17697088, "step": 17575 }, { "epoch": 8.288543140028288, "grad_norm": 0.2886093258857727, "learning_rate": 3.639611224325426e-05, "loss": 0.2267, "num_input_tokens_seen": 17704480, "step": 17580 }, { "epoch": 8.29090051862329, "grad_norm": 1.5596786737442017, "learning_rate": 3.638695612351112e-05, "loss": 0.2092, "num_input_tokens_seen": 17709984, "step": 17585 }, { "epoch": 8.293257897218293, "grad_norm": 1.2839192152023315, "learning_rate": 3.637779807614967e-05, "loss": 0.1443, "num_input_tokens_seen": 17715328, "step": 17590 }, { "epoch": 8.295615275813295, "grad_norm": 2.835841655731201, "learning_rate": 3.6368638102720173e-05, "loss": 0.2866, "num_input_tokens_seen": 17719776, "step": 17595 }, { "epoch": 8.297972654408298, "grad_norm": 5.590050220489502, "learning_rate": 3.635947620477328e-05, "loss": 0.2662, "num_input_tokens_seen": 17725376, "step": 17600 }, { "epoch": 8.3003300330033, "grad_norm": 0.07116000354290009, "learning_rate": 3.635031238385993e-05, "loss": 0.1599, "num_input_tokens_seen": 17729600, "step": 17605 }, { "epoch": 8.302687411598303, "grad_norm": 0.035284265875816345, "learning_rate": 3.634114664153143e-05, "loss": 0.1091, "num_input_tokens_seen": 17733984, "step": 17610 }, { "epoch": 8.305044790193305, "grad_norm": 0.46453356742858887, "learning_rate": 3.633197897933934e-05, "loss": 0.1778, "num_input_tokens_seen": 17738912, "step": 17615 }, { "epoch": 8.307402168788308, "grad_norm": 2.231973171234131, "learning_rate": 3.6322809398835636e-05, "loss": 0.1613, "num_input_tokens_seen": 17743136, "step": 17620 }, { "epoch": 8.30975954738331, "grad_norm": 2.3959498405456543, "learning_rate": 3.631363790157254e-05, "loss": 0.0598, "num_input_tokens_seen": 17748608, "step": 17625 }, { "epoch": 8.312116925978312, "grad_norm": 0.05968882516026497, "learning_rate": 3.630446448910265e-05, "loss": 0.0455, "num_input_tokens_seen": 17752576, "step": 17630 }, { "epoch": 8.314474304573315, "grad_norm": 1.296937346458435, "learning_rate": 3.629528916297885e-05, "loss": 0.0619, "num_input_tokens_seen": 17756864, "step": 17635 }, { "epoch": 8.316831683168317, "grad_norm": 1.3668086528778076, "learning_rate": 3.628611192475438e-05, "loss": 0.153, "num_input_tokens_seen": 17761952, "step": 17640 }, { "epoch": 8.31918906176332, "grad_norm": 0.6398258209228516, "learning_rate": 3.62769327759828e-05, "loss": 0.0636, "num_input_tokens_seen": 17765920, "step": 17645 }, { "epoch": 8.321546440358322, "grad_norm": 0.21054799854755402, "learning_rate": 3.6267751718217963e-05, "loss": 0.0813, "num_input_tokens_seen": 17771744, "step": 17650 }, { "epoch": 8.323903818953324, "grad_norm": 0.4061322808265686, "learning_rate": 3.625856875301408e-05, "loss": 0.0617, "num_input_tokens_seen": 17776032, "step": 17655 }, { "epoch": 8.326261197548327, "grad_norm": 0.35701119899749756, "learning_rate": 3.6249383881925655e-05, "loss": 0.0633, "num_input_tokens_seen": 17781696, "step": 17660 }, { "epoch": 8.32861857614333, "grad_norm": 0.30767303705215454, "learning_rate": 3.624019710650755e-05, "loss": 0.0624, "num_input_tokens_seen": 17786688, "step": 17665 }, { "epoch": 8.330975954738332, "grad_norm": 0.08878856152296066, "learning_rate": 3.623100842831493e-05, "loss": 0.1178, "num_input_tokens_seen": 17790912, "step": 17670 }, { "epoch": 8.333333333333334, "grad_norm": 0.42253440618515015, "learning_rate": 3.622181784890327e-05, "loss": 0.0963, "num_input_tokens_seen": 17796832, "step": 17675 }, { "epoch": 8.335690711928336, "grad_norm": 0.08129440993070602, "learning_rate": 3.621262536982838e-05, "loss": 0.1215, "num_input_tokens_seen": 17801280, "step": 17680 }, { "epoch": 8.338048090523339, "grad_norm": 0.8552100658416748, "learning_rate": 3.6203430992646397e-05, "loss": 0.1412, "num_input_tokens_seen": 17805824, "step": 17685 }, { "epoch": 8.340405469118341, "grad_norm": 0.8376390933990479, "learning_rate": 3.619423471891377e-05, "loss": 0.2347, "num_input_tokens_seen": 17811840, "step": 17690 }, { "epoch": 8.342762847713344, "grad_norm": 0.30765798687934875, "learning_rate": 3.618503655018728e-05, "loss": 0.0689, "num_input_tokens_seen": 17816768, "step": 17695 }, { "epoch": 8.345120226308346, "grad_norm": 0.2833087742328644, "learning_rate": 3.6175836488024e-05, "loss": 0.2184, "num_input_tokens_seen": 17821824, "step": 17700 }, { "epoch": 8.347477604903348, "grad_norm": 1.923264980316162, "learning_rate": 3.616663453398136e-05, "loss": 0.1823, "num_input_tokens_seen": 17826400, "step": 17705 }, { "epoch": 8.34983498349835, "grad_norm": 0.08321305364370346, "learning_rate": 3.6157430689617094e-05, "loss": 0.24, "num_input_tokens_seen": 17831264, "step": 17710 }, { "epoch": 8.352192362093351, "grad_norm": 1.3764337301254272, "learning_rate": 3.614822495648926e-05, "loss": 0.2114, "num_input_tokens_seen": 17835840, "step": 17715 }, { "epoch": 8.354549740688354, "grad_norm": 0.14578266441822052, "learning_rate": 3.613901733615622e-05, "loss": 0.084, "num_input_tokens_seen": 17839776, "step": 17720 }, { "epoch": 8.356907119283356, "grad_norm": 0.3549465835094452, "learning_rate": 3.612980783017667e-05, "loss": 0.0482, "num_input_tokens_seen": 17844416, "step": 17725 }, { "epoch": 8.359264497878359, "grad_norm": 1.284973382949829, "learning_rate": 3.612059644010964e-05, "loss": 0.1103, "num_input_tokens_seen": 17849024, "step": 17730 }, { "epoch": 8.361621876473361, "grad_norm": 1.8244534730911255, "learning_rate": 3.611138316751445e-05, "loss": 0.0851, "num_input_tokens_seen": 17853440, "step": 17735 }, { "epoch": 8.363979255068363, "grad_norm": 0.4287204146385193, "learning_rate": 3.610216801395074e-05, "loss": 0.0609, "num_input_tokens_seen": 17858432, "step": 17740 }, { "epoch": 8.366336633663366, "grad_norm": 0.6222121119499207, "learning_rate": 3.60929509809785e-05, "loss": 0.1874, "num_input_tokens_seen": 17864352, "step": 17745 }, { "epoch": 8.368694012258368, "grad_norm": 1.0643922090530396, "learning_rate": 3.6083732070158e-05, "loss": 0.1618, "num_input_tokens_seen": 17869024, "step": 17750 }, { "epoch": 8.37105139085337, "grad_norm": 1.4778835773468018, "learning_rate": 3.6074511283049854e-05, "loss": 0.1328, "num_input_tokens_seen": 17874944, "step": 17755 }, { "epoch": 8.373408769448373, "grad_norm": 1.356952428817749, "learning_rate": 3.606528862121498e-05, "loss": 0.3567, "num_input_tokens_seen": 17879264, "step": 17760 }, { "epoch": 8.375766148043375, "grad_norm": 1.3196312189102173, "learning_rate": 3.605606408621462e-05, "loss": 0.0923, "num_input_tokens_seen": 17884960, "step": 17765 }, { "epoch": 8.378123526638378, "grad_norm": 0.23402398824691772, "learning_rate": 3.6046837679610336e-05, "loss": 0.1031, "num_input_tokens_seen": 17889856, "step": 17770 }, { "epoch": 8.38048090523338, "grad_norm": 0.08185042440891266, "learning_rate": 3.6037609402964e-05, "loss": 0.0554, "num_input_tokens_seen": 17894240, "step": 17775 }, { "epoch": 8.382838283828383, "grad_norm": 0.010784216225147247, "learning_rate": 3.602837925783778e-05, "loss": 0.1388, "num_input_tokens_seen": 17899552, "step": 17780 }, { "epoch": 8.385195662423385, "grad_norm": 1.319926381111145, "learning_rate": 3.601914724579423e-05, "loss": 0.1881, "num_input_tokens_seen": 17904512, "step": 17785 }, { "epoch": 8.387553041018387, "grad_norm": 0.29952919483184814, "learning_rate": 3.600991336839613e-05, "loss": 0.0917, "num_input_tokens_seen": 17910016, "step": 17790 }, { "epoch": 8.38991041961339, "grad_norm": 1.156323790550232, "learning_rate": 3.600067762720663e-05, "loss": 0.083, "num_input_tokens_seen": 17914560, "step": 17795 }, { "epoch": 8.392267798208392, "grad_norm": 1.5386531352996826, "learning_rate": 3.59914400237892e-05, "loss": 0.1636, "num_input_tokens_seen": 17919456, "step": 17800 }, { "epoch": 8.394625176803395, "grad_norm": 0.3286114037036896, "learning_rate": 3.59822005597076e-05, "loss": 0.1512, "num_input_tokens_seen": 17924352, "step": 17805 }, { "epoch": 8.396982555398397, "grad_norm": 0.09767847508192062, "learning_rate": 3.5972959236525904e-05, "loss": 0.1906, "num_input_tokens_seen": 17929024, "step": 17810 }, { "epoch": 8.3993399339934, "grad_norm": 0.3727637529373169, "learning_rate": 3.5963716055808514e-05, "loss": 0.0486, "num_input_tokens_seen": 17933568, "step": 17815 }, { "epoch": 8.401697312588402, "grad_norm": 0.13195613026618958, "learning_rate": 3.5954471019120166e-05, "loss": 0.1695, "num_input_tokens_seen": 17938336, "step": 17820 }, { "epoch": 8.404054691183404, "grad_norm": 0.04855029284954071, "learning_rate": 3.594522412802586e-05, "loss": 0.1429, "num_input_tokens_seen": 17943808, "step": 17825 }, { "epoch": 8.406412069778407, "grad_norm": 1.1278560161590576, "learning_rate": 3.593597538409095e-05, "loss": 0.1245, "num_input_tokens_seen": 17949536, "step": 17830 }, { "epoch": 8.408769448373409, "grad_norm": 0.5418758988380432, "learning_rate": 3.592672478888109e-05, "loss": 0.0773, "num_input_tokens_seen": 17954560, "step": 17835 }, { "epoch": 8.411126826968411, "grad_norm": 0.194697305560112, "learning_rate": 3.591747234396225e-05, "loss": 0.1706, "num_input_tokens_seen": 17960288, "step": 17840 }, { "epoch": 8.413484205563414, "grad_norm": 1.110054850578308, "learning_rate": 3.59082180509007e-05, "loss": 0.1296, "num_input_tokens_seen": 17965056, "step": 17845 }, { "epoch": 8.415841584158416, "grad_norm": 0.3386138677597046, "learning_rate": 3.589896191126306e-05, "loss": 0.047, "num_input_tokens_seen": 17969728, "step": 17850 }, { "epoch": 8.418198962753419, "grad_norm": 1.8036631345748901, "learning_rate": 3.588970392661622e-05, "loss": 0.33, "num_input_tokens_seen": 17975072, "step": 17855 }, { "epoch": 8.420556341348421, "grad_norm": 0.2483869194984436, "learning_rate": 3.588044409852739e-05, "loss": 0.2056, "num_input_tokens_seen": 17980480, "step": 17860 }, { "epoch": 8.422913719943423, "grad_norm": 0.04439937323331833, "learning_rate": 3.5871182428564126e-05, "loss": 0.1559, "num_input_tokens_seen": 17985152, "step": 17865 }, { "epoch": 8.425271098538426, "grad_norm": 1.0526158809661865, "learning_rate": 3.5861918918294255e-05, "loss": 0.0718, "num_input_tokens_seen": 17991264, "step": 17870 }, { "epoch": 8.427628477133428, "grad_norm": 0.574641227722168, "learning_rate": 3.585265356928593e-05, "loss": 0.0755, "num_input_tokens_seen": 17995712, "step": 17875 }, { "epoch": 8.42998585572843, "grad_norm": 1.0967223644256592, "learning_rate": 3.584338638310763e-05, "loss": 0.0614, "num_input_tokens_seen": 18001088, "step": 17880 }, { "epoch": 8.432343234323433, "grad_norm": 0.12885485589504242, "learning_rate": 3.5834117361328124e-05, "loss": 0.0266, "num_input_tokens_seen": 18006368, "step": 17885 }, { "epoch": 8.434700612918435, "grad_norm": 0.22842267155647278, "learning_rate": 3.58248465055165e-05, "loss": 0.0745, "num_input_tokens_seen": 18012128, "step": 17890 }, { "epoch": 8.437057991513438, "grad_norm": 0.7586572766304016, "learning_rate": 3.581557381724216e-05, "loss": 0.1597, "num_input_tokens_seen": 18016384, "step": 17895 }, { "epoch": 8.43941537010844, "grad_norm": 0.3871343731880188, "learning_rate": 3.580629929807481e-05, "loss": 0.1166, "num_input_tokens_seen": 18021696, "step": 17900 }, { "epoch": 8.441772748703443, "grad_norm": 1.7148534059524536, "learning_rate": 3.579702294958448e-05, "loss": 0.1696, "num_input_tokens_seen": 18026592, "step": 17905 }, { "epoch": 8.444130127298443, "grad_norm": 0.1767362654209137, "learning_rate": 3.578774477334148e-05, "loss": 0.0371, "num_input_tokens_seen": 18032160, "step": 17910 }, { "epoch": 8.446487505893446, "grad_norm": 0.031670354306697845, "learning_rate": 3.5778464770916444e-05, "loss": 0.0938, "num_input_tokens_seen": 18036128, "step": 17915 }, { "epoch": 8.448844884488448, "grad_norm": 2.3282272815704346, "learning_rate": 3.5769182943880344e-05, "loss": 0.1998, "num_input_tokens_seen": 18041152, "step": 17920 }, { "epoch": 8.45120226308345, "grad_norm": 0.5493398308753967, "learning_rate": 3.5759899293804416e-05, "loss": 0.0356, "num_input_tokens_seen": 18046144, "step": 17925 }, { "epoch": 8.453559641678453, "grad_norm": 0.5587999820709229, "learning_rate": 3.5750613822260234e-05, "loss": 0.0318, "num_input_tokens_seen": 18051200, "step": 17930 }, { "epoch": 8.455917020273455, "grad_norm": 0.7281860709190369, "learning_rate": 3.574132653081966e-05, "loss": 0.0408, "num_input_tokens_seen": 18057344, "step": 17935 }, { "epoch": 8.458274398868458, "grad_norm": 0.8082342147827148, "learning_rate": 3.5732037421054885e-05, "loss": 0.1748, "num_input_tokens_seen": 18061216, "step": 17940 }, { "epoch": 8.46063177746346, "grad_norm": 0.8741174936294556, "learning_rate": 3.572274649453839e-05, "loss": 0.0485, "num_input_tokens_seen": 18066432, "step": 17945 }, { "epoch": 8.462989156058462, "grad_norm": 0.04341953620314598, "learning_rate": 3.5713453752842986e-05, "loss": 0.0362, "num_input_tokens_seen": 18072512, "step": 17950 }, { "epoch": 8.465346534653465, "grad_norm": 0.8911886811256409, "learning_rate": 3.570415919754176e-05, "loss": 0.1217, "num_input_tokens_seen": 18077120, "step": 17955 }, { "epoch": 8.467703913248467, "grad_norm": 0.8344811797142029, "learning_rate": 3.569486283020813e-05, "loss": 0.0704, "num_input_tokens_seen": 18082720, "step": 17960 }, { "epoch": 8.47006129184347, "grad_norm": 0.08913277834653854, "learning_rate": 3.568556465241581e-05, "loss": 0.1393, "num_input_tokens_seen": 18086848, "step": 17965 }, { "epoch": 8.472418670438472, "grad_norm": 3.2132351398468018, "learning_rate": 3.5676264665738824e-05, "loss": 0.1248, "num_input_tokens_seen": 18094144, "step": 17970 }, { "epoch": 8.474776049033474, "grad_norm": 2.3418257236480713, "learning_rate": 3.5666962871751495e-05, "loss": 0.1912, "num_input_tokens_seen": 18099104, "step": 17975 }, { "epoch": 8.477133427628477, "grad_norm": 0.20735464990139008, "learning_rate": 3.565765927202848e-05, "loss": 0.0493, "num_input_tokens_seen": 18103200, "step": 17980 }, { "epoch": 8.47949080622348, "grad_norm": 0.03282094746828079, "learning_rate": 3.56483538681447e-05, "loss": 0.0654, "num_input_tokens_seen": 18108384, "step": 17985 }, { "epoch": 8.481848184818482, "grad_norm": 1.5464000701904297, "learning_rate": 3.5639046661675414e-05, "loss": 0.1507, "num_input_tokens_seen": 18113184, "step": 17990 }, { "epoch": 8.484205563413484, "grad_norm": 1.0118364095687866, "learning_rate": 3.562973765419616e-05, "loss": 0.0488, "num_input_tokens_seen": 18116992, "step": 17995 }, { "epoch": 8.486562942008486, "grad_norm": 0.1122230589389801, "learning_rate": 3.5620426847282815e-05, "loss": 0.0506, "num_input_tokens_seen": 18122176, "step": 18000 }, { "epoch": 8.488920320603489, "grad_norm": 2.7601125240325928, "learning_rate": 3.561111424251152e-05, "loss": 0.2296, "num_input_tokens_seen": 18127264, "step": 18005 }, { "epoch": 8.491277699198491, "grad_norm": 0.5340060591697693, "learning_rate": 3.560179984145876e-05, "loss": 0.1819, "num_input_tokens_seen": 18132288, "step": 18010 }, { "epoch": 8.493635077793494, "grad_norm": 0.12290558964014053, "learning_rate": 3.559248364570129e-05, "loss": 0.1761, "num_input_tokens_seen": 18137440, "step": 18015 }, { "epoch": 8.495992456388496, "grad_norm": 0.3294444680213928, "learning_rate": 3.558316565681619e-05, "loss": 0.0349, "num_input_tokens_seen": 18142400, "step": 18020 }, { "epoch": 8.498349834983498, "grad_norm": 0.3138546943664551, "learning_rate": 3.557384587638083e-05, "loss": 0.137, "num_input_tokens_seen": 18146720, "step": 18025 }, { "epoch": 8.500707213578501, "grad_norm": 2.350324869155884, "learning_rate": 3.55645243059729e-05, "loss": 0.2201, "num_input_tokens_seen": 18151040, "step": 18030 }, { "epoch": 8.503064592173503, "grad_norm": 0.8162368535995483, "learning_rate": 3.555520094717038e-05, "loss": 0.2598, "num_input_tokens_seen": 18155456, "step": 18035 }, { "epoch": 8.505421970768506, "grad_norm": 0.5092465281486511, "learning_rate": 3.554587580155156e-05, "loss": 0.2028, "num_input_tokens_seen": 18160640, "step": 18040 }, { "epoch": 8.507779349363508, "grad_norm": 2.142807960510254, "learning_rate": 3.553654887069502e-05, "loss": 0.134, "num_input_tokens_seen": 18166784, "step": 18045 }, { "epoch": 8.51013672795851, "grad_norm": 0.16759848594665527, "learning_rate": 3.5527220156179664e-05, "loss": 0.182, "num_input_tokens_seen": 18171648, "step": 18050 }, { "epoch": 8.512494106553513, "grad_norm": 0.7119427919387817, "learning_rate": 3.5517889659584675e-05, "loss": 0.1535, "num_input_tokens_seen": 18176384, "step": 18055 }, { "epoch": 8.514851485148515, "grad_norm": 0.02108835242688656, "learning_rate": 3.550855738248955e-05, "loss": 0.1017, "num_input_tokens_seen": 18182272, "step": 18060 }, { "epoch": 8.517208863743518, "grad_norm": 0.24812133610248566, "learning_rate": 3.549922332647408e-05, "loss": 0.2218, "num_input_tokens_seen": 18186848, "step": 18065 }, { "epoch": 8.51956624233852, "grad_norm": 1.1209222078323364, "learning_rate": 3.548988749311838e-05, "loss": 0.2262, "num_input_tokens_seen": 18191104, "step": 18070 }, { "epoch": 8.521923620933523, "grad_norm": 0.07954873144626617, "learning_rate": 3.5480549884002826e-05, "loss": 0.0861, "num_input_tokens_seen": 18195936, "step": 18075 }, { "epoch": 8.524280999528525, "grad_norm": 0.019256867468357086, "learning_rate": 3.5471210500708125e-05, "loss": 0.0368, "num_input_tokens_seen": 18200960, "step": 18080 }, { "epoch": 8.526638378123527, "grad_norm": 0.7369322180747986, "learning_rate": 3.546186934481528e-05, "loss": 0.1426, "num_input_tokens_seen": 18206880, "step": 18085 }, { "epoch": 8.52899575671853, "grad_norm": 0.9250702857971191, "learning_rate": 3.545252641790558e-05, "loss": 0.164, "num_input_tokens_seen": 18211072, "step": 18090 }, { "epoch": 8.531353135313532, "grad_norm": 0.7893537282943726, "learning_rate": 3.544318172156065e-05, "loss": 0.1881, "num_input_tokens_seen": 18215392, "step": 18095 }, { "epoch": 8.533710513908535, "grad_norm": 0.1739516705274582, "learning_rate": 3.543383525736235e-05, "loss": 0.0527, "num_input_tokens_seen": 18221248, "step": 18100 }, { "epoch": 8.536067892503535, "grad_norm": 1.9700899124145508, "learning_rate": 3.542448702689291e-05, "loss": 0.178, "num_input_tokens_seen": 18226816, "step": 18105 }, { "epoch": 8.53842527109854, "grad_norm": 1.5040606260299683, "learning_rate": 3.5415137031734815e-05, "loss": 0.1212, "num_input_tokens_seen": 18231488, "step": 18110 }, { "epoch": 8.54078264969354, "grad_norm": 0.39445629715919495, "learning_rate": 3.5405785273470846e-05, "loss": 0.05, "num_input_tokens_seen": 18236032, "step": 18115 }, { "epoch": 8.543140028288542, "grad_norm": 0.6313861012458801, "learning_rate": 3.539643175368412e-05, "loss": 0.0478, "num_input_tokens_seen": 18240640, "step": 18120 }, { "epoch": 8.545497406883545, "grad_norm": 1.5737926959991455, "learning_rate": 3.538707647395802e-05, "loss": 0.1519, "num_input_tokens_seen": 18246848, "step": 18125 }, { "epoch": 8.547854785478547, "grad_norm": 0.17581146955490112, "learning_rate": 3.537771943587623e-05, "loss": 0.0435, "num_input_tokens_seen": 18252768, "step": 18130 }, { "epoch": 8.55021216407355, "grad_norm": 0.354053795337677, "learning_rate": 3.536836064102275e-05, "loss": 0.1155, "num_input_tokens_seen": 18257312, "step": 18135 }, { "epoch": 8.552569542668552, "grad_norm": 0.4754999876022339, "learning_rate": 3.535900009098185e-05, "loss": 0.0906, "num_input_tokens_seen": 18263456, "step": 18140 }, { "epoch": 8.554926921263554, "grad_norm": 0.5133696794509888, "learning_rate": 3.534963778733813e-05, "loss": 0.049, "num_input_tokens_seen": 18268320, "step": 18145 }, { "epoch": 8.557284299858557, "grad_norm": 0.053798843175172806, "learning_rate": 3.5340273731676456e-05, "loss": 0.275, "num_input_tokens_seen": 18272256, "step": 18150 }, { "epoch": 8.55964167845356, "grad_norm": 0.5307068824768066, "learning_rate": 3.533090792558201e-05, "loss": 0.2328, "num_input_tokens_seen": 18276512, "step": 18155 }, { "epoch": 8.561999057048562, "grad_norm": 0.13467656075954437, "learning_rate": 3.532154037064025e-05, "loss": 0.0741, "num_input_tokens_seen": 18282688, "step": 18160 }, { "epoch": 8.564356435643564, "grad_norm": 0.14243197441101074, "learning_rate": 3.531217106843696e-05, "loss": 0.1447, "num_input_tokens_seen": 18287584, "step": 18165 }, { "epoch": 8.566713814238566, "grad_norm": 0.872307300567627, "learning_rate": 3.5302800020558205e-05, "loss": 0.1496, "num_input_tokens_seen": 18292768, "step": 18170 }, { "epoch": 8.569071192833569, "grad_norm": 0.9907553791999817, "learning_rate": 3.529342722859034e-05, "loss": 0.1955, "num_input_tokens_seen": 18297920, "step": 18175 }, { "epoch": 8.571428571428571, "grad_norm": 0.044342055916786194, "learning_rate": 3.5284052694120005e-05, "loss": 0.1051, "num_input_tokens_seen": 18302304, "step": 18180 }, { "epoch": 8.573785950023574, "grad_norm": 0.22661615908145905, "learning_rate": 3.5274676418734163e-05, "loss": 0.0782, "num_input_tokens_seen": 18307264, "step": 18185 }, { "epoch": 8.576143328618576, "grad_norm": 0.8734410405158997, "learning_rate": 3.526529840402006e-05, "loss": 0.1298, "num_input_tokens_seen": 18312640, "step": 18190 }, { "epoch": 8.578500707213578, "grad_norm": 1.8717548847198486, "learning_rate": 3.5255918651565246e-05, "loss": 0.1589, "num_input_tokens_seen": 18316928, "step": 18195 }, { "epoch": 8.58085808580858, "grad_norm": 2.113954782485962, "learning_rate": 3.524653716295753e-05, "loss": 0.2473, "num_input_tokens_seen": 18323072, "step": 18200 }, { "epoch": 8.583215464403583, "grad_norm": 0.3389100134372711, "learning_rate": 3.523715393978505e-05, "loss": 0.0977, "num_input_tokens_seen": 18327296, "step": 18205 }, { "epoch": 8.585572842998586, "grad_norm": 0.4123916029930115, "learning_rate": 3.522776898363621e-05, "loss": 0.0982, "num_input_tokens_seen": 18331168, "step": 18210 }, { "epoch": 8.587930221593588, "grad_norm": 0.17995981872081757, "learning_rate": 3.521838229609976e-05, "loss": 0.092, "num_input_tokens_seen": 18337248, "step": 18215 }, { "epoch": 8.59028760018859, "grad_norm": 1.0813850164413452, "learning_rate": 3.5208993878764674e-05, "loss": 0.0719, "num_input_tokens_seen": 18342240, "step": 18220 }, { "epoch": 8.592644978783593, "grad_norm": 0.08525243401527405, "learning_rate": 3.519960373322026e-05, "loss": 0.063, "num_input_tokens_seen": 18347040, "step": 18225 }, { "epoch": 8.595002357378595, "grad_norm": 0.3404823839664459, "learning_rate": 3.5190211861056124e-05, "loss": 0.2234, "num_input_tokens_seen": 18352320, "step": 18230 }, { "epoch": 8.597359735973598, "grad_norm": 0.4380449652671814, "learning_rate": 3.5180818263862137e-05, "loss": 0.2042, "num_input_tokens_seen": 18357280, "step": 18235 }, { "epoch": 8.5997171145686, "grad_norm": 1.9240800142288208, "learning_rate": 3.517142294322848e-05, "loss": 0.1148, "num_input_tokens_seen": 18361920, "step": 18240 }, { "epoch": 8.602074493163602, "grad_norm": 0.8406971096992493, "learning_rate": 3.516202590074562e-05, "loss": 0.1532, "num_input_tokens_seen": 18366720, "step": 18245 }, { "epoch": 8.604431871758605, "grad_norm": 0.1658288538455963, "learning_rate": 3.515262713800431e-05, "loss": 0.0955, "num_input_tokens_seen": 18371776, "step": 18250 }, { "epoch": 8.606789250353607, "grad_norm": 0.12369490414857864, "learning_rate": 3.5143226656595604e-05, "loss": 0.0759, "num_input_tokens_seen": 18376544, "step": 18255 }, { "epoch": 8.60914662894861, "grad_norm": 1.0525755882263184, "learning_rate": 3.5133824458110856e-05, "loss": 0.1676, "num_input_tokens_seen": 18380704, "step": 18260 }, { "epoch": 8.611504007543612, "grad_norm": 0.41466283798217773, "learning_rate": 3.512442054414169e-05, "loss": 0.0673, "num_input_tokens_seen": 18384800, "step": 18265 }, { "epoch": 8.613861386138614, "grad_norm": 0.4920143783092499, "learning_rate": 3.511501491628002e-05, "loss": 0.1202, "num_input_tokens_seen": 18390848, "step": 18270 }, { "epoch": 8.616218764733617, "grad_norm": 0.4142104685306549, "learning_rate": 3.510560757611807e-05, "loss": 0.1822, "num_input_tokens_seen": 18396064, "step": 18275 }, { "epoch": 8.61857614332862, "grad_norm": 1.4186348915100098, "learning_rate": 3.5096198525248355e-05, "loss": 0.3791, "num_input_tokens_seen": 18401696, "step": 18280 }, { "epoch": 8.620933521923622, "grad_norm": 0.7716480493545532, "learning_rate": 3.508678776526364e-05, "loss": 0.0985, "num_input_tokens_seen": 18406752, "step": 18285 }, { "epoch": 8.623290900518624, "grad_norm": 0.30659040808677673, "learning_rate": 3.507737529775702e-05, "loss": 0.0627, "num_input_tokens_seen": 18411552, "step": 18290 }, { "epoch": 8.625648279113626, "grad_norm": 1.2610150575637817, "learning_rate": 3.506796112432187e-05, "loss": 0.1923, "num_input_tokens_seen": 18416256, "step": 18295 }, { "epoch": 8.628005657708629, "grad_norm": 0.5734191536903381, "learning_rate": 3.505854524655184e-05, "loss": 0.0434, "num_input_tokens_seen": 18421440, "step": 18300 }, { "epoch": 8.630363036303631, "grad_norm": 0.889215886592865, "learning_rate": 3.5049127666040895e-05, "loss": 0.2011, "num_input_tokens_seen": 18426912, "step": 18305 }, { "epoch": 8.632720414898632, "grad_norm": 0.3021644949913025, "learning_rate": 3.503970838438325e-05, "loss": 0.1422, "num_input_tokens_seen": 18430624, "step": 18310 }, { "epoch": 8.635077793493634, "grad_norm": 0.6126091480255127, "learning_rate": 3.5030287403173446e-05, "loss": 0.2197, "num_input_tokens_seen": 18436576, "step": 18315 }, { "epoch": 8.637435172088637, "grad_norm": 1.5514931678771973, "learning_rate": 3.5020864724006294e-05, "loss": 0.1287, "num_input_tokens_seen": 18441280, "step": 18320 }, { "epoch": 8.639792550683639, "grad_norm": 0.7274057865142822, "learning_rate": 3.501144034847688e-05, "loss": 0.0478, "num_input_tokens_seen": 18446464, "step": 18325 }, { "epoch": 8.642149929278641, "grad_norm": 0.04332266375422478, "learning_rate": 3.50020142781806e-05, "loss": 0.082, "num_input_tokens_seen": 18451840, "step": 18330 }, { "epoch": 8.644507307873644, "grad_norm": 1.7758874893188477, "learning_rate": 3.499258651471313e-05, "loss": 0.1319, "num_input_tokens_seen": 18456416, "step": 18335 }, { "epoch": 8.646864686468646, "grad_norm": 0.39173999428749084, "learning_rate": 3.498315705967043e-05, "loss": 0.1047, "num_input_tokens_seen": 18461312, "step": 18340 }, { "epoch": 8.649222065063649, "grad_norm": 2.240185499191284, "learning_rate": 3.4973725914648743e-05, "loss": 0.1747, "num_input_tokens_seen": 18465664, "step": 18345 }, { "epoch": 8.651579443658651, "grad_norm": 0.5272642970085144, "learning_rate": 3.49642930812446e-05, "loss": 0.0531, "num_input_tokens_seen": 18471168, "step": 18350 }, { "epoch": 8.653936822253653, "grad_norm": 0.3414381742477417, "learning_rate": 3.495485856105483e-05, "loss": 0.2008, "num_input_tokens_seen": 18475552, "step": 18355 }, { "epoch": 8.656294200848656, "grad_norm": 0.5766698718070984, "learning_rate": 3.494542235567652e-05, "loss": 0.045, "num_input_tokens_seen": 18481120, "step": 18360 }, { "epoch": 8.658651579443658, "grad_norm": 1.0052047967910767, "learning_rate": 3.493598446670707e-05, "loss": 0.0971, "num_input_tokens_seen": 18485632, "step": 18365 }, { "epoch": 8.66100895803866, "grad_norm": 0.04645088315010071, "learning_rate": 3.492654489574416e-05, "loss": 0.0626, "num_input_tokens_seen": 18490304, "step": 18370 }, { "epoch": 8.663366336633663, "grad_norm": 2.6309683322906494, "learning_rate": 3.491710364438573e-05, "loss": 0.2569, "num_input_tokens_seen": 18496416, "step": 18375 }, { "epoch": 8.665723715228665, "grad_norm": 0.4723461866378784, "learning_rate": 3.490766071423005e-05, "loss": 0.1205, "num_input_tokens_seen": 18503072, "step": 18380 }, { "epoch": 8.668081093823668, "grad_norm": 1.503570795059204, "learning_rate": 3.489821610687562e-05, "loss": 0.1031, "num_input_tokens_seen": 18508352, "step": 18385 }, { "epoch": 8.67043847241867, "grad_norm": 0.21374750137329102, "learning_rate": 3.488876982392127e-05, "loss": 0.0509, "num_input_tokens_seen": 18512448, "step": 18390 }, { "epoch": 8.672795851013673, "grad_norm": 0.33243876695632935, "learning_rate": 3.48793218669661e-05, "loss": 0.0639, "num_input_tokens_seen": 18517856, "step": 18395 }, { "epoch": 8.675153229608675, "grad_norm": 0.1644972413778305, "learning_rate": 3.486987223760947e-05, "loss": 0.0845, "num_input_tokens_seen": 18522560, "step": 18400 }, { "epoch": 8.677510608203677, "grad_norm": 0.08842208236455917, "learning_rate": 3.4860420937451055e-05, "loss": 0.1104, "num_input_tokens_seen": 18526784, "step": 18405 }, { "epoch": 8.67986798679868, "grad_norm": 1.4746118783950806, "learning_rate": 3.485096796809079e-05, "loss": 0.1824, "num_input_tokens_seen": 18531104, "step": 18410 }, { "epoch": 8.682225365393682, "grad_norm": 0.22655731439590454, "learning_rate": 3.4841513331128914e-05, "loss": 0.1684, "num_input_tokens_seen": 18535520, "step": 18415 }, { "epoch": 8.684582743988685, "grad_norm": 0.7523722052574158, "learning_rate": 3.483205702816593e-05, "loss": 0.0705, "num_input_tokens_seen": 18541056, "step": 18420 }, { "epoch": 8.686940122583687, "grad_norm": 0.17158839106559753, "learning_rate": 3.482259906080263e-05, "loss": 0.0442, "num_input_tokens_seen": 18546336, "step": 18425 }, { "epoch": 8.68929750117869, "grad_norm": 0.764976978302002, "learning_rate": 3.481313943064008e-05, "loss": 0.0592, "num_input_tokens_seen": 18552544, "step": 18430 }, { "epoch": 8.691654879773692, "grad_norm": 1.8960202932357788, "learning_rate": 3.480367813927965e-05, "loss": 0.1747, "num_input_tokens_seen": 18558016, "step": 18435 }, { "epoch": 8.694012258368694, "grad_norm": 0.1642693430185318, "learning_rate": 3.4794215188322964e-05, "loss": 0.0829, "num_input_tokens_seen": 18563040, "step": 18440 }, { "epoch": 8.696369636963697, "grad_norm": 0.4494180381298065, "learning_rate": 3.478475057937194e-05, "loss": 0.2105, "num_input_tokens_seen": 18567584, "step": 18445 }, { "epoch": 8.698727015558699, "grad_norm": 0.0405014269053936, "learning_rate": 3.477528431402879e-05, "loss": 0.2731, "num_input_tokens_seen": 18573888, "step": 18450 }, { "epoch": 8.701084394153701, "grad_norm": 0.8383165597915649, "learning_rate": 3.4765816393895964e-05, "loss": 0.0953, "num_input_tokens_seen": 18577952, "step": 18455 }, { "epoch": 8.703441772748704, "grad_norm": 0.6359530091285706, "learning_rate": 3.475634682057625e-05, "loss": 0.0781, "num_input_tokens_seen": 18582048, "step": 18460 }, { "epoch": 8.705799151343706, "grad_norm": 0.25334203243255615, "learning_rate": 3.474687559567268e-05, "loss": 0.0425, "num_input_tokens_seen": 18586912, "step": 18465 }, { "epoch": 8.708156529938709, "grad_norm": 0.4455842673778534, "learning_rate": 3.473740272078855e-05, "loss": 0.034, "num_input_tokens_seen": 18591904, "step": 18470 }, { "epoch": 8.710513908533711, "grad_norm": 0.9983137845993042, "learning_rate": 3.472792819752747e-05, "loss": 0.0671, "num_input_tokens_seen": 18597056, "step": 18475 }, { "epoch": 8.712871287128714, "grad_norm": 0.15858842432498932, "learning_rate": 3.471845202749332e-05, "loss": 0.1088, "num_input_tokens_seen": 18602240, "step": 18480 }, { "epoch": 8.715228665723716, "grad_norm": 1.56969153881073, "learning_rate": 3.470897421229026e-05, "loss": 0.1453, "num_input_tokens_seen": 18608800, "step": 18485 }, { "epoch": 8.717586044318718, "grad_norm": 0.10398919880390167, "learning_rate": 3.46994947535227e-05, "loss": 0.0513, "num_input_tokens_seen": 18613184, "step": 18490 }, { "epoch": 8.71994342291372, "grad_norm": 1.637954592704773, "learning_rate": 3.4690013652795374e-05, "loss": 0.2321, "num_input_tokens_seen": 18617728, "step": 18495 }, { "epoch": 8.722300801508723, "grad_norm": 2.045713186264038, "learning_rate": 3.468053091171326e-05, "loss": 0.203, "num_input_tokens_seen": 18622624, "step": 18500 }, { "epoch": 8.724658180103724, "grad_norm": 0.6101592779159546, "learning_rate": 3.4671046531881633e-05, "loss": 0.1017, "num_input_tokens_seen": 18627712, "step": 18505 }, { "epoch": 8.727015558698728, "grad_norm": 0.2326946258544922, "learning_rate": 3.4661560514906034e-05, "loss": 0.1638, "num_input_tokens_seen": 18632928, "step": 18510 }, { "epoch": 8.729372937293729, "grad_norm": 0.5669156908988953, "learning_rate": 3.465207286239228e-05, "loss": 0.0945, "num_input_tokens_seen": 18639264, "step": 18515 }, { "epoch": 8.731730315888731, "grad_norm": 1.339853048324585, "learning_rate": 3.4642583575946476e-05, "loss": 0.0544, "num_input_tokens_seen": 18643776, "step": 18520 }, { "epoch": 8.734087694483733, "grad_norm": 0.7171821594238281, "learning_rate": 3.4633092657174995e-05, "loss": 0.1805, "num_input_tokens_seen": 18648192, "step": 18525 }, { "epoch": 8.736445073078736, "grad_norm": 0.6744474768638611, "learning_rate": 3.462360010768448e-05, "loss": 0.0713, "num_input_tokens_seen": 18653120, "step": 18530 }, { "epoch": 8.738802451673738, "grad_norm": 0.25476813316345215, "learning_rate": 3.4614105929081876e-05, "loss": 0.2582, "num_input_tokens_seen": 18658208, "step": 18535 }, { "epoch": 8.74115983026874, "grad_norm": 0.13586845993995667, "learning_rate": 3.4604610122974376e-05, "loss": 0.1146, "num_input_tokens_seen": 18662944, "step": 18540 }, { "epoch": 8.743517208863743, "grad_norm": 1.006264090538025, "learning_rate": 3.4595112690969454e-05, "loss": 0.1617, "num_input_tokens_seen": 18668608, "step": 18545 }, { "epoch": 8.745874587458745, "grad_norm": 2.725001335144043, "learning_rate": 3.4585613634674874e-05, "loss": 0.0993, "num_input_tokens_seen": 18673792, "step": 18550 }, { "epoch": 8.748231966053748, "grad_norm": 0.7074993848800659, "learning_rate": 3.457611295569865e-05, "loss": 0.0984, "num_input_tokens_seen": 18678848, "step": 18555 }, { "epoch": 8.75058934464875, "grad_norm": 1.3748050928115845, "learning_rate": 3.456661065564911e-05, "loss": 0.2667, "num_input_tokens_seen": 18682944, "step": 18560 }, { "epoch": 8.752946723243753, "grad_norm": 0.26122555136680603, "learning_rate": 3.455710673613481e-05, "loss": 0.1882, "num_input_tokens_seen": 18688032, "step": 18565 }, { "epoch": 8.755304101838755, "grad_norm": 0.4770769476890564, "learning_rate": 3.454760119876461e-05, "loss": 0.1257, "num_input_tokens_seen": 18693760, "step": 18570 }, { "epoch": 8.757661480433757, "grad_norm": 0.35041749477386475, "learning_rate": 3.453809404514764e-05, "loss": 0.0325, "num_input_tokens_seen": 18698464, "step": 18575 }, { "epoch": 8.76001885902876, "grad_norm": 1.4902660846710205, "learning_rate": 3.4528585276893294e-05, "loss": 0.1045, "num_input_tokens_seen": 18704096, "step": 18580 }, { "epoch": 8.762376237623762, "grad_norm": 1.8144699335098267, "learning_rate": 3.4519074895611244e-05, "loss": 0.1154, "num_input_tokens_seen": 18709408, "step": 18585 }, { "epoch": 8.764733616218765, "grad_norm": 1.136695146560669, "learning_rate": 3.450956290291144e-05, "loss": 0.1484, "num_input_tokens_seen": 18713920, "step": 18590 }, { "epoch": 8.767090994813767, "grad_norm": 1.174445629119873, "learning_rate": 3.45000493004041e-05, "loss": 0.0957, "num_input_tokens_seen": 18717952, "step": 18595 }, { "epoch": 8.76944837340877, "grad_norm": 0.5709683895111084, "learning_rate": 3.4490534089699716e-05, "loss": 0.1871, "num_input_tokens_seen": 18722208, "step": 18600 }, { "epoch": 8.771805752003772, "grad_norm": 0.028930647298693657, "learning_rate": 3.448101727240905e-05, "loss": 0.0258, "num_input_tokens_seen": 18726848, "step": 18605 }, { "epoch": 8.774163130598774, "grad_norm": 1.5078940391540527, "learning_rate": 3.447149885014314e-05, "loss": 0.244, "num_input_tokens_seen": 18731808, "step": 18610 }, { "epoch": 8.776520509193777, "grad_norm": 1.1049538850784302, "learning_rate": 3.4461978824513285e-05, "loss": 0.1282, "num_input_tokens_seen": 18736128, "step": 18615 }, { "epoch": 8.778877887788779, "grad_norm": 1.4034024477005005, "learning_rate": 3.445245719713108e-05, "loss": 0.1135, "num_input_tokens_seen": 18741024, "step": 18620 }, { "epoch": 8.781235266383781, "grad_norm": 0.2232900708913803, "learning_rate": 3.444293396960835e-05, "loss": 0.1502, "num_input_tokens_seen": 18747968, "step": 18625 }, { "epoch": 8.783592644978784, "grad_norm": 0.07353267818689346, "learning_rate": 3.443340914355725e-05, "loss": 0.0265, "num_input_tokens_seen": 18752672, "step": 18630 }, { "epoch": 8.785950023573786, "grad_norm": 0.22100739181041718, "learning_rate": 3.442388272059014e-05, "loss": 0.0381, "num_input_tokens_seen": 18756928, "step": 18635 }, { "epoch": 8.788307402168789, "grad_norm": 0.9987851977348328, "learning_rate": 3.4414354702319694e-05, "loss": 0.0445, "num_input_tokens_seen": 18760864, "step": 18640 }, { "epoch": 8.790664780763791, "grad_norm": 0.4529334008693695, "learning_rate": 3.440482509035885e-05, "loss": 0.0765, "num_input_tokens_seen": 18765664, "step": 18645 }, { "epoch": 8.793022159358793, "grad_norm": 1.0168914794921875, "learning_rate": 3.43952938863208e-05, "loss": 0.1795, "num_input_tokens_seen": 18771456, "step": 18650 }, { "epoch": 8.795379537953796, "grad_norm": 0.11118575185537338, "learning_rate": 3.438576109181902e-05, "loss": 0.1398, "num_input_tokens_seen": 18776000, "step": 18655 }, { "epoch": 8.797736916548798, "grad_norm": 0.24065493047237396, "learning_rate": 3.437622670846724e-05, "loss": 0.1018, "num_input_tokens_seen": 18780704, "step": 18660 }, { "epoch": 8.8000942951438, "grad_norm": 2.147312641143799, "learning_rate": 3.436669073787949e-05, "loss": 0.1416, "num_input_tokens_seen": 18786176, "step": 18665 }, { "epoch": 8.802451673738803, "grad_norm": 0.1657651662826538, "learning_rate": 3.435715318167003e-05, "loss": 0.0476, "num_input_tokens_seen": 18790400, "step": 18670 }, { "epoch": 8.804809052333805, "grad_norm": 0.9031980633735657, "learning_rate": 3.434761404145341e-05, "loss": 0.1089, "num_input_tokens_seen": 18795968, "step": 18675 }, { "epoch": 8.807166430928808, "grad_norm": 1.2794586420059204, "learning_rate": 3.433807331884445e-05, "loss": 0.1764, "num_input_tokens_seen": 18800960, "step": 18680 }, { "epoch": 8.80952380952381, "grad_norm": 1.483898639678955, "learning_rate": 3.432853101545822e-05, "loss": 0.067, "num_input_tokens_seen": 18805696, "step": 18685 }, { "epoch": 8.811881188118813, "grad_norm": 1.0153007507324219, "learning_rate": 3.431898713291009e-05, "loss": 0.0312, "num_input_tokens_seen": 18810784, "step": 18690 }, { "epoch": 8.814238566713815, "grad_norm": 2.0924830436706543, "learning_rate": 3.430944167281566e-05, "loss": 0.1546, "num_input_tokens_seen": 18816288, "step": 18695 }, { "epoch": 8.816595945308817, "grad_norm": 0.9805338382720947, "learning_rate": 3.429989463679082e-05, "loss": 0.0749, "num_input_tokens_seen": 18820928, "step": 18700 }, { "epoch": 8.81895332390382, "grad_norm": 2.0118753910064697, "learning_rate": 3.429034602645171e-05, "loss": 0.0832, "num_input_tokens_seen": 18824352, "step": 18705 }, { "epoch": 8.82131070249882, "grad_norm": 0.10135167837142944, "learning_rate": 3.428079584341477e-05, "loss": 0.1121, "num_input_tokens_seen": 18828992, "step": 18710 }, { "epoch": 8.823668081093825, "grad_norm": 0.5276851058006287, "learning_rate": 3.4271244089296685e-05, "loss": 0.1384, "num_input_tokens_seen": 18834656, "step": 18715 }, { "epoch": 8.826025459688825, "grad_norm": 0.14496760070323944, "learning_rate": 3.426169076571437e-05, "loss": 0.0863, "num_input_tokens_seen": 18840096, "step": 18720 }, { "epoch": 8.828382838283828, "grad_norm": 0.7714778780937195, "learning_rate": 3.425213587428507e-05, "loss": 0.1548, "num_input_tokens_seen": 18845472, "step": 18725 }, { "epoch": 8.83074021687883, "grad_norm": 0.5847660303115845, "learning_rate": 3.4242579416626266e-05, "loss": 0.066, "num_input_tokens_seen": 18849792, "step": 18730 }, { "epoch": 8.833097595473832, "grad_norm": 0.08694411814212799, "learning_rate": 3.42330213943557e-05, "loss": 0.0313, "num_input_tokens_seen": 18854624, "step": 18735 }, { "epoch": 8.835454974068835, "grad_norm": 1.6755056381225586, "learning_rate": 3.422346180909137e-05, "loss": 0.1741, "num_input_tokens_seen": 18859328, "step": 18740 }, { "epoch": 8.837812352663837, "grad_norm": 1.012387990951538, "learning_rate": 3.421390066245156e-05, "loss": 0.0705, "num_input_tokens_seen": 18864640, "step": 18745 }, { "epoch": 8.84016973125884, "grad_norm": 1.569052815437317, "learning_rate": 3.420433795605481e-05, "loss": 0.2898, "num_input_tokens_seen": 18869376, "step": 18750 }, { "epoch": 8.842527109853842, "grad_norm": 1.364652156829834, "learning_rate": 3.4194773691519934e-05, "loss": 0.1117, "num_input_tokens_seen": 18873920, "step": 18755 }, { "epoch": 8.844884488448844, "grad_norm": 0.8740851879119873, "learning_rate": 3.418520787046598e-05, "loss": 0.1165, "num_input_tokens_seen": 18879488, "step": 18760 }, { "epoch": 8.847241867043847, "grad_norm": 0.4926604926586151, "learning_rate": 3.41756404945123e-05, "loss": 0.0742, "num_input_tokens_seen": 18884768, "step": 18765 }, { "epoch": 8.84959924563885, "grad_norm": 0.06398383527994156, "learning_rate": 3.416607156527847e-05, "loss": 0.1545, "num_input_tokens_seen": 18888992, "step": 18770 }, { "epoch": 8.851956624233852, "grad_norm": 0.14741621911525726, "learning_rate": 3.415650108438436e-05, "loss": 0.0189, "num_input_tokens_seen": 18893664, "step": 18775 }, { "epoch": 8.854314002828854, "grad_norm": 1.6459076404571533, "learning_rate": 3.4146929053450086e-05, "loss": 0.3648, "num_input_tokens_seen": 18899328, "step": 18780 }, { "epoch": 8.856671381423856, "grad_norm": 1.0151091814041138, "learning_rate": 3.413735547409602e-05, "loss": 0.1617, "num_input_tokens_seen": 18904320, "step": 18785 }, { "epoch": 8.859028760018859, "grad_norm": 0.2718338668346405, "learning_rate": 3.4127780347942826e-05, "loss": 0.201, "num_input_tokens_seen": 18909056, "step": 18790 }, { "epoch": 8.861386138613861, "grad_norm": 0.8208471536636353, "learning_rate": 3.41182036766114e-05, "loss": 0.1293, "num_input_tokens_seen": 18913536, "step": 18795 }, { "epoch": 8.863743517208864, "grad_norm": 0.22767767310142517, "learning_rate": 3.410862546172291e-05, "loss": 0.2077, "num_input_tokens_seen": 18919456, "step": 18800 }, { "epoch": 8.866100895803866, "grad_norm": 1.2012017965316772, "learning_rate": 3.4099045704898775e-05, "loss": 0.1453, "num_input_tokens_seen": 18924512, "step": 18805 }, { "epoch": 8.868458274398868, "grad_norm": 1.2353620529174805, "learning_rate": 3.40894644077607e-05, "loss": 0.0468, "num_input_tokens_seen": 18929344, "step": 18810 }, { "epoch": 8.87081565299387, "grad_norm": 0.8771470189094543, "learning_rate": 3.407988157193063e-05, "loss": 0.114, "num_input_tokens_seen": 18934560, "step": 18815 }, { "epoch": 8.873173031588873, "grad_norm": 0.09906590729951859, "learning_rate": 3.4070297199030784e-05, "loss": 0.0555, "num_input_tokens_seen": 18940256, "step": 18820 }, { "epoch": 8.875530410183876, "grad_norm": 1.174039363861084, "learning_rate": 3.406071129068362e-05, "loss": 0.21, "num_input_tokens_seen": 18946272, "step": 18825 }, { "epoch": 8.877887788778878, "grad_norm": 0.7504554986953735, "learning_rate": 3.405112384851188e-05, "loss": 0.19, "num_input_tokens_seen": 18952576, "step": 18830 }, { "epoch": 8.88024516737388, "grad_norm": 0.08355976641178131, "learning_rate": 3.4041534874138546e-05, "loss": 0.0501, "num_input_tokens_seen": 18957344, "step": 18835 }, { "epoch": 8.882602545968883, "grad_norm": 0.9659743905067444, "learning_rate": 3.403194436918689e-05, "loss": 0.0393, "num_input_tokens_seen": 18962144, "step": 18840 }, { "epoch": 8.884959924563885, "grad_norm": 2.295945882797241, "learning_rate": 3.402235233528039e-05, "loss": 0.1864, "num_input_tokens_seen": 18966816, "step": 18845 }, { "epoch": 8.887317303158888, "grad_norm": 1.337828516960144, "learning_rate": 3.4012758774042837e-05, "loss": 0.1062, "num_input_tokens_seen": 18970496, "step": 18850 }, { "epoch": 8.88967468175389, "grad_norm": 1.2175214290618896, "learning_rate": 3.4003163687098245e-05, "loss": 0.1105, "num_input_tokens_seen": 18975264, "step": 18855 }, { "epoch": 8.892032060348892, "grad_norm": 0.6749551892280579, "learning_rate": 3.399356707607091e-05, "loss": 0.1281, "num_input_tokens_seen": 18979808, "step": 18860 }, { "epoch": 8.894389438943895, "grad_norm": 0.10744091123342514, "learning_rate": 3.3983968942585376e-05, "loss": 0.0702, "num_input_tokens_seen": 18984832, "step": 18865 }, { "epoch": 8.896746817538897, "grad_norm": 0.8022431135177612, "learning_rate": 3.397436928826643e-05, "loss": 0.0555, "num_input_tokens_seen": 18990752, "step": 18870 }, { "epoch": 8.8991041961339, "grad_norm": 1.6736953258514404, "learning_rate": 3.396476811473915e-05, "loss": 0.2359, "num_input_tokens_seen": 18995616, "step": 18875 }, { "epoch": 8.901461574728902, "grad_norm": 0.012878036126494408, "learning_rate": 3.395516542362884e-05, "loss": 0.0442, "num_input_tokens_seen": 19000320, "step": 18880 }, { "epoch": 8.903818953323904, "grad_norm": 0.0783785730600357, "learning_rate": 3.3945561216561065e-05, "loss": 0.037, "num_input_tokens_seen": 19007360, "step": 18885 }, { "epoch": 8.906176331918907, "grad_norm": 0.12143045663833618, "learning_rate": 3.393595549516166e-05, "loss": 0.0833, "num_input_tokens_seen": 19012576, "step": 18890 }, { "epoch": 8.90853371051391, "grad_norm": 0.10366038978099823, "learning_rate": 3.3926348261056725e-05, "loss": 0.1114, "num_input_tokens_seen": 19017344, "step": 18895 }, { "epoch": 8.910891089108912, "grad_norm": 0.1477525532245636, "learning_rate": 3.391673951587259e-05, "loss": 0.0805, "num_input_tokens_seen": 19022016, "step": 18900 }, { "epoch": 8.913248467703912, "grad_norm": 1.2966822385787964, "learning_rate": 3.390712926123585e-05, "loss": 0.2116, "num_input_tokens_seen": 19026912, "step": 18905 }, { "epoch": 8.915605846298917, "grad_norm": 0.6171517968177795, "learning_rate": 3.3897517498773364e-05, "loss": 0.071, "num_input_tokens_seen": 19033184, "step": 18910 }, { "epoch": 8.917963224893917, "grad_norm": 0.14411896467208862, "learning_rate": 3.388790423011223e-05, "loss": 0.1264, "num_input_tokens_seen": 19037664, "step": 18915 }, { "epoch": 8.92032060348892, "grad_norm": 1.7048602104187012, "learning_rate": 3.387828945687982e-05, "loss": 0.1656, "num_input_tokens_seen": 19042784, "step": 18920 }, { "epoch": 8.922677982083922, "grad_norm": 0.9494151473045349, "learning_rate": 3.3868673180703755e-05, "loss": 0.1128, "num_input_tokens_seen": 19047424, "step": 18925 }, { "epoch": 8.925035360678924, "grad_norm": 0.5030516386032104, "learning_rate": 3.3859055403211894e-05, "loss": 0.151, "num_input_tokens_seen": 19052512, "step": 18930 }, { "epoch": 8.927392739273927, "grad_norm": 1.1335219144821167, "learning_rate": 3.384943612603238e-05, "loss": 0.2638, "num_input_tokens_seen": 19056992, "step": 18935 }, { "epoch": 8.92975011786893, "grad_norm": 0.6823767423629761, "learning_rate": 3.383981535079357e-05, "loss": 0.0534, "num_input_tokens_seen": 19061984, "step": 18940 }, { "epoch": 8.932107496463932, "grad_norm": 0.34639570116996765, "learning_rate": 3.383019307912413e-05, "loss": 0.1585, "num_input_tokens_seen": 19066592, "step": 18945 }, { "epoch": 8.934464875058934, "grad_norm": 1.9420075416564941, "learning_rate": 3.382056931265292e-05, "loss": 0.1463, "num_input_tokens_seen": 19072128, "step": 18950 }, { "epoch": 8.936822253653936, "grad_norm": 1.4051131010055542, "learning_rate": 3.381094405300909e-05, "loss": 0.1262, "num_input_tokens_seen": 19077056, "step": 18955 }, { "epoch": 8.939179632248939, "grad_norm": 0.6983122825622559, "learning_rate": 3.3801317301822035e-05, "loss": 0.059, "num_input_tokens_seen": 19082048, "step": 18960 }, { "epoch": 8.941537010843941, "grad_norm": 0.1679019182920456, "learning_rate": 3.37916890607214e-05, "loss": 0.0792, "num_input_tokens_seen": 19086912, "step": 18965 }, { "epoch": 8.943894389438944, "grad_norm": 0.18005579710006714, "learning_rate": 3.378205933133708e-05, "loss": 0.1016, "num_input_tokens_seen": 19090816, "step": 18970 }, { "epoch": 8.946251768033946, "grad_norm": 0.84787517786026, "learning_rate": 3.377242811529922e-05, "loss": 0.0484, "num_input_tokens_seen": 19096224, "step": 18975 }, { "epoch": 8.948609146628948, "grad_norm": 0.35200533270835876, "learning_rate": 3.376279541423824e-05, "loss": 0.1091, "num_input_tokens_seen": 19100352, "step": 18980 }, { "epoch": 8.95096652522395, "grad_norm": 1.1246156692504883, "learning_rate": 3.3753161229784766e-05, "loss": 0.0761, "num_input_tokens_seen": 19104480, "step": 18985 }, { "epoch": 8.953323903818953, "grad_norm": 0.7159964442253113, "learning_rate": 3.374352556356973e-05, "loss": 0.058, "num_input_tokens_seen": 19109152, "step": 18990 }, { "epoch": 8.955681282413956, "grad_norm": 0.9523252844810486, "learning_rate": 3.3733888417224265e-05, "loss": 0.1219, "num_input_tokens_seen": 19116512, "step": 18995 }, { "epoch": 8.958038661008958, "grad_norm": 0.8561285138130188, "learning_rate": 3.372424979237978e-05, "loss": 0.0814, "num_input_tokens_seen": 19121920, "step": 19000 }, { "epoch": 8.96039603960396, "grad_norm": 0.5603660345077515, "learning_rate": 3.371460969066794e-05, "loss": 0.0902, "num_input_tokens_seen": 19126144, "step": 19005 }, { "epoch": 8.962753418198963, "grad_norm": 1.8452301025390625, "learning_rate": 3.370496811372065e-05, "loss": 0.1744, "num_input_tokens_seen": 19131648, "step": 19010 }, { "epoch": 8.965110796793965, "grad_norm": 0.3804435133934021, "learning_rate": 3.3695325063170054e-05, "loss": 0.1578, "num_input_tokens_seen": 19136768, "step": 19015 }, { "epoch": 8.967468175388968, "grad_norm": 0.16827771067619324, "learning_rate": 3.368568054064856e-05, "loss": 0.2551, "num_input_tokens_seen": 19141792, "step": 19020 }, { "epoch": 8.96982555398397, "grad_norm": 0.07146807014942169, "learning_rate": 3.367603454778884e-05, "loss": 0.0477, "num_input_tokens_seen": 19146720, "step": 19025 }, { "epoch": 8.972182932578972, "grad_norm": 0.1863330602645874, "learning_rate": 3.366638708622377e-05, "loss": 0.2125, "num_input_tokens_seen": 19151776, "step": 19030 }, { "epoch": 8.974540311173975, "grad_norm": 0.08163100481033325, "learning_rate": 3.365673815758651e-05, "loss": 0.0225, "num_input_tokens_seen": 19157856, "step": 19035 }, { "epoch": 8.976897689768977, "grad_norm": 2.2843685150146484, "learning_rate": 3.364708776351047e-05, "loss": 0.1521, "num_input_tokens_seen": 19162752, "step": 19040 }, { "epoch": 8.97925506836398, "grad_norm": 1.2958859205245972, "learning_rate": 3.3637435905629284e-05, "loss": 0.1502, "num_input_tokens_seen": 19166944, "step": 19045 }, { "epoch": 8.981612446958982, "grad_norm": 1.806240200996399, "learning_rate": 3.362778258557686e-05, "loss": 0.1948, "num_input_tokens_seen": 19172064, "step": 19050 }, { "epoch": 8.983969825553984, "grad_norm": 1.2497504949569702, "learning_rate": 3.361812780498733e-05, "loss": 0.1272, "num_input_tokens_seen": 19177216, "step": 19055 }, { "epoch": 8.986327204148987, "grad_norm": 1.0948132276535034, "learning_rate": 3.3608471565495095e-05, "loss": 0.1482, "num_input_tokens_seen": 19182560, "step": 19060 }, { "epoch": 8.98868458274399, "grad_norm": 0.4280930161476135, "learning_rate": 3.359881386873479e-05, "loss": 0.1537, "num_input_tokens_seen": 19187520, "step": 19065 }, { "epoch": 8.991041961338992, "grad_norm": 0.17578262090682983, "learning_rate": 3.3589154716341294e-05, "loss": 0.1157, "num_input_tokens_seen": 19192160, "step": 19070 }, { "epoch": 8.993399339933994, "grad_norm": 1.4664549827575684, "learning_rate": 3.357949410994974e-05, "loss": 0.0819, "num_input_tokens_seen": 19196832, "step": 19075 }, { "epoch": 8.995756718528996, "grad_norm": 0.42477381229400635, "learning_rate": 3.35698320511955e-05, "loss": 0.0982, "num_input_tokens_seen": 19202272, "step": 19080 }, { "epoch": 8.998114097123999, "grad_norm": 1.4224674701690674, "learning_rate": 3.3560168541714206e-05, "loss": 0.3383, "num_input_tokens_seen": 19207616, "step": 19085 }, { "epoch": 9.0, "eval_loss": 0.15185481309890747, "eval_runtime": 15.0781, "eval_samples_per_second": 62.541, "eval_steps_per_second": 15.652, "num_input_tokens_seen": 19211360, "step": 19089 }, { "epoch": 9.000471475719001, "grad_norm": 0.9667254686355591, "learning_rate": 3.355050358314172e-05, "loss": 0.1032, "num_input_tokens_seen": 19212512, "step": 19090 }, { "epoch": 9.002828854314004, "grad_norm": 0.46077999472618103, "learning_rate": 3.354083717711416e-05, "loss": 0.0446, "num_input_tokens_seen": 19219552, "step": 19095 }, { "epoch": 9.005186232909006, "grad_norm": 0.040581513196229935, "learning_rate": 3.353116932526787e-05, "loss": 0.102, "num_input_tokens_seen": 19224320, "step": 19100 }, { "epoch": 9.007543611504008, "grad_norm": 0.904215931892395, "learning_rate": 3.352150002923947e-05, "loss": 0.0545, "num_input_tokens_seen": 19229536, "step": 19105 }, { "epoch": 9.009900990099009, "grad_norm": 2.0388734340667725, "learning_rate": 3.351182929066581e-05, "loss": 0.2174, "num_input_tokens_seen": 19233920, "step": 19110 }, { "epoch": 9.012258368694011, "grad_norm": 0.24460923671722412, "learning_rate": 3.3502157111183965e-05, "loss": 0.1436, "num_input_tokens_seen": 19238944, "step": 19115 }, { "epoch": 9.014615747289014, "grad_norm": 1.237303614616394, "learning_rate": 3.349248349243129e-05, "loss": 0.1677, "num_input_tokens_seen": 19243744, "step": 19120 }, { "epoch": 9.016973125884016, "grad_norm": 0.7532092928886414, "learning_rate": 3.3482808436045344e-05, "loss": 0.1553, "num_input_tokens_seen": 19248608, "step": 19125 }, { "epoch": 9.019330504479019, "grad_norm": 0.08952578157186508, "learning_rate": 3.347313194366396e-05, "loss": 0.0829, "num_input_tokens_seen": 19253152, "step": 19130 }, { "epoch": 9.021687883074021, "grad_norm": 0.32387447357177734, "learning_rate": 3.346345401692521e-05, "loss": 0.2017, "num_input_tokens_seen": 19258112, "step": 19135 }, { "epoch": 9.024045261669023, "grad_norm": 0.9280523657798767, "learning_rate": 3.34537746574674e-05, "loss": 0.0674, "num_input_tokens_seen": 19262624, "step": 19140 }, { "epoch": 9.026402640264026, "grad_norm": 0.6234618425369263, "learning_rate": 3.344409386692906e-05, "loss": 0.1199, "num_input_tokens_seen": 19267360, "step": 19145 }, { "epoch": 9.028760018859028, "grad_norm": 0.1795564442873001, "learning_rate": 3.343441164694901e-05, "loss": 0.2416, "num_input_tokens_seen": 19272608, "step": 19150 }, { "epoch": 9.03111739745403, "grad_norm": 1.4252889156341553, "learning_rate": 3.342472799916628e-05, "loss": 0.0867, "num_input_tokens_seen": 19276448, "step": 19155 }, { "epoch": 9.033474776049033, "grad_norm": 0.10837731510400772, "learning_rate": 3.341504292522014e-05, "loss": 0.165, "num_input_tokens_seen": 19280832, "step": 19160 }, { "epoch": 9.035832154644035, "grad_norm": 0.20777937769889832, "learning_rate": 3.34053564267501e-05, "loss": 0.1047, "num_input_tokens_seen": 19285824, "step": 19165 }, { "epoch": 9.038189533239038, "grad_norm": 0.07830439507961273, "learning_rate": 3.339566850539595e-05, "loss": 0.0832, "num_input_tokens_seen": 19291424, "step": 19170 }, { "epoch": 9.04054691183404, "grad_norm": 1.336593508720398, "learning_rate": 3.338597916279767e-05, "loss": 0.0689, "num_input_tokens_seen": 19297024, "step": 19175 }, { "epoch": 9.042904290429043, "grad_norm": 0.5711853504180908, "learning_rate": 3.33762884005955e-05, "loss": 0.0536, "num_input_tokens_seen": 19301440, "step": 19180 }, { "epoch": 9.045261669024045, "grad_norm": 0.17127063870429993, "learning_rate": 3.3366596220429925e-05, "loss": 0.1484, "num_input_tokens_seen": 19306848, "step": 19185 }, { "epoch": 9.047619047619047, "grad_norm": 1.1280308961868286, "learning_rate": 3.335690262394166e-05, "loss": 0.0974, "num_input_tokens_seen": 19312992, "step": 19190 }, { "epoch": 9.04997642621405, "grad_norm": 1.1820439100265503, "learning_rate": 3.3347207612771685e-05, "loss": 0.4488, "num_input_tokens_seen": 19320544, "step": 19195 }, { "epoch": 9.052333804809052, "grad_norm": 0.1938328891992569, "learning_rate": 3.333751118856118e-05, "loss": 0.1104, "num_input_tokens_seen": 19325088, "step": 19200 }, { "epoch": 9.054691183404055, "grad_norm": 0.23264102637767792, "learning_rate": 3.33278133529516e-05, "loss": 0.0901, "num_input_tokens_seen": 19328768, "step": 19205 }, { "epoch": 9.057048561999057, "grad_norm": 0.8201847076416016, "learning_rate": 3.331811410758461e-05, "loss": 0.1215, "num_input_tokens_seen": 19334144, "step": 19210 }, { "epoch": 9.05940594059406, "grad_norm": 0.9384645819664001, "learning_rate": 3.330841345410214e-05, "loss": 0.1003, "num_input_tokens_seen": 19339552, "step": 19215 }, { "epoch": 9.061763319189062, "grad_norm": 0.17311185598373413, "learning_rate": 3.329871139414634e-05, "loss": 0.0361, "num_input_tokens_seen": 19344928, "step": 19220 }, { "epoch": 9.064120697784064, "grad_norm": 0.0674392580986023, "learning_rate": 3.328900792935961e-05, "loss": 0.1945, "num_input_tokens_seen": 19349856, "step": 19225 }, { "epoch": 9.066478076379067, "grad_norm": 0.10324382036924362, "learning_rate": 3.3279303061384566e-05, "loss": 0.1124, "num_input_tokens_seen": 19354368, "step": 19230 }, { "epoch": 9.068835454974069, "grad_norm": 0.05472348630428314, "learning_rate": 3.32695967918641e-05, "loss": 0.0614, "num_input_tokens_seen": 19360096, "step": 19235 }, { "epoch": 9.071192833569071, "grad_norm": 1.0603303909301758, "learning_rate": 3.32598891224413e-05, "loss": 0.0564, "num_input_tokens_seen": 19364544, "step": 19240 }, { "epoch": 9.073550212164074, "grad_norm": 0.3689463138580322, "learning_rate": 3.3250180054759515e-05, "loss": 0.1846, "num_input_tokens_seen": 19369728, "step": 19245 }, { "epoch": 9.075907590759076, "grad_norm": 0.6209053993225098, "learning_rate": 3.3240469590462334e-05, "loss": 0.1145, "num_input_tokens_seen": 19375392, "step": 19250 }, { "epoch": 9.078264969354079, "grad_norm": 0.08267923444509506, "learning_rate": 3.3230757731193564e-05, "loss": 0.0828, "num_input_tokens_seen": 19380704, "step": 19255 }, { "epoch": 9.080622347949081, "grad_norm": 0.22085878252983093, "learning_rate": 3.322104447859726e-05, "loss": 0.035, "num_input_tokens_seen": 19384992, "step": 19260 }, { "epoch": 9.082979726544083, "grad_norm": 0.6793509721755981, "learning_rate": 3.3211329834317706e-05, "loss": 0.125, "num_input_tokens_seen": 19390112, "step": 19265 }, { "epoch": 9.085337105139086, "grad_norm": 0.430453360080719, "learning_rate": 3.320161379999943e-05, "loss": 0.1095, "num_input_tokens_seen": 19395360, "step": 19270 }, { "epoch": 9.087694483734088, "grad_norm": 2.0338938236236572, "learning_rate": 3.31918963772872e-05, "loss": 0.1161, "num_input_tokens_seen": 19399712, "step": 19275 }, { "epoch": 9.09005186232909, "grad_norm": 0.2747354209423065, "learning_rate": 3.3182177567826005e-05, "loss": 0.1051, "num_input_tokens_seen": 19404960, "step": 19280 }, { "epoch": 9.092409240924093, "grad_norm": 0.6290506720542908, "learning_rate": 3.317245737326107e-05, "loss": 0.1708, "num_input_tokens_seen": 19410944, "step": 19285 }, { "epoch": 9.094766619519095, "grad_norm": 0.06229935958981514, "learning_rate": 3.316273579523787e-05, "loss": 0.0444, "num_input_tokens_seen": 19416480, "step": 19290 }, { "epoch": 9.097123998114098, "grad_norm": 1.6117258071899414, "learning_rate": 3.315301283540208e-05, "loss": 0.2501, "num_input_tokens_seen": 19420960, "step": 19295 }, { "epoch": 9.0994813767091, "grad_norm": 2.041572093963623, "learning_rate": 3.314328849539966e-05, "loss": 0.329, "num_input_tokens_seen": 19426112, "step": 19300 }, { "epoch": 9.101838755304103, "grad_norm": 1.3729416131973267, "learning_rate": 3.313356277687675e-05, "loss": 0.1617, "num_input_tokens_seen": 19430592, "step": 19305 }, { "epoch": 9.104196133899103, "grad_norm": 0.508098304271698, "learning_rate": 3.3123835681479774e-05, "loss": 0.1385, "num_input_tokens_seen": 19435168, "step": 19310 }, { "epoch": 9.106553512494106, "grad_norm": 0.27975744009017944, "learning_rate": 3.311410721085534e-05, "loss": 0.0435, "num_input_tokens_seen": 19440672, "step": 19315 }, { "epoch": 9.108910891089108, "grad_norm": 0.44295310974121094, "learning_rate": 3.310437736665034e-05, "loss": 0.081, "num_input_tokens_seen": 19444992, "step": 19320 }, { "epoch": 9.11126826968411, "grad_norm": 0.45837321877479553, "learning_rate": 3.309464615051185e-05, "loss": 0.092, "num_input_tokens_seen": 19451040, "step": 19325 }, { "epoch": 9.113625648279113, "grad_norm": 0.929783821105957, "learning_rate": 3.30849135640872e-05, "loss": 0.2135, "num_input_tokens_seen": 19455328, "step": 19330 }, { "epoch": 9.115983026874115, "grad_norm": 0.5445961952209473, "learning_rate": 3.307517960902397e-05, "loss": 0.0354, "num_input_tokens_seen": 19462176, "step": 19335 }, { "epoch": 9.118340405469118, "grad_norm": 0.6753615736961365, "learning_rate": 3.3065444286969935e-05, "loss": 0.0327, "num_input_tokens_seen": 19467360, "step": 19340 }, { "epoch": 9.12069778406412, "grad_norm": 0.9017568826675415, "learning_rate": 3.305570759957312e-05, "loss": 0.0959, "num_input_tokens_seen": 19473376, "step": 19345 }, { "epoch": 9.123055162659123, "grad_norm": 0.07546696811914444, "learning_rate": 3.304596954848179e-05, "loss": 0.0319, "num_input_tokens_seen": 19477408, "step": 19350 }, { "epoch": 9.125412541254125, "grad_norm": 0.5531951189041138, "learning_rate": 3.303623013534442e-05, "loss": 0.3211, "num_input_tokens_seen": 19482656, "step": 19355 }, { "epoch": 9.127769919849127, "grad_norm": 0.030428476631641388, "learning_rate": 3.302648936180977e-05, "loss": 0.2037, "num_input_tokens_seen": 19488416, "step": 19360 }, { "epoch": 9.13012729844413, "grad_norm": 0.22907090187072754, "learning_rate": 3.301674722952672e-05, "loss": 0.0531, "num_input_tokens_seen": 19494144, "step": 19365 }, { "epoch": 9.132484677039132, "grad_norm": 0.09226822108030319, "learning_rate": 3.300700374014448e-05, "loss": 0.0729, "num_input_tokens_seen": 19498944, "step": 19370 }, { "epoch": 9.134842055634135, "grad_norm": 0.14942701160907745, "learning_rate": 3.299725889531247e-05, "loss": 0.0355, "num_input_tokens_seen": 19504448, "step": 19375 }, { "epoch": 9.137199434229137, "grad_norm": 0.3492525517940521, "learning_rate": 3.298751269668031e-05, "loss": 0.0922, "num_input_tokens_seen": 19510176, "step": 19380 }, { "epoch": 9.13955681282414, "grad_norm": 0.3463575839996338, "learning_rate": 3.297776514589787e-05, "loss": 0.1096, "num_input_tokens_seen": 19514688, "step": 19385 }, { "epoch": 9.141914191419142, "grad_norm": 0.36425545811653137, "learning_rate": 3.296801624461525e-05, "loss": 0.074, "num_input_tokens_seen": 19519328, "step": 19390 }, { "epoch": 9.144271570014144, "grad_norm": 0.15450894832611084, "learning_rate": 3.2958265994482766e-05, "loss": 0.0489, "num_input_tokens_seen": 19524992, "step": 19395 }, { "epoch": 9.146628948609147, "grad_norm": 0.6671811938285828, "learning_rate": 3.294851439715098e-05, "loss": 0.0381, "num_input_tokens_seen": 19530752, "step": 19400 }, { "epoch": 9.148986327204149, "grad_norm": 0.11589448153972626, "learning_rate": 3.293876145427065e-05, "loss": 0.0952, "num_input_tokens_seen": 19536704, "step": 19405 }, { "epoch": 9.151343705799151, "grad_norm": 0.22316324710845947, "learning_rate": 3.292900716749281e-05, "loss": 0.0243, "num_input_tokens_seen": 19540352, "step": 19410 }, { "epoch": 9.153701084394154, "grad_norm": 1.166772484779358, "learning_rate": 3.291925153846868e-05, "loss": 0.2405, "num_input_tokens_seen": 19544640, "step": 19415 }, { "epoch": 9.156058462989156, "grad_norm": 0.29022017121315, "learning_rate": 3.290949456884973e-05, "loss": 0.1397, "num_input_tokens_seen": 19549216, "step": 19420 }, { "epoch": 9.158415841584159, "grad_norm": 0.547635555267334, "learning_rate": 3.289973626028763e-05, "loss": 0.1709, "num_input_tokens_seen": 19553408, "step": 19425 }, { "epoch": 9.160773220179161, "grad_norm": 0.40545937418937683, "learning_rate": 3.2889976614434325e-05, "loss": 0.2005, "num_input_tokens_seen": 19558752, "step": 19430 }, { "epoch": 9.163130598774163, "grad_norm": 0.6997222304344177, "learning_rate": 3.288021563294193e-05, "loss": 0.2622, "num_input_tokens_seen": 19563840, "step": 19435 }, { "epoch": 9.165487977369166, "grad_norm": 0.015208248980343342, "learning_rate": 3.287045331746285e-05, "loss": 0.2158, "num_input_tokens_seen": 19568576, "step": 19440 }, { "epoch": 9.167845355964168, "grad_norm": 0.7064257264137268, "learning_rate": 3.286068966964964e-05, "loss": 0.175, "num_input_tokens_seen": 19573504, "step": 19445 }, { "epoch": 9.17020273455917, "grad_norm": 0.03999810293316841, "learning_rate": 3.285092469115514e-05, "loss": 0.1807, "num_input_tokens_seen": 19578176, "step": 19450 }, { "epoch": 9.172560113154173, "grad_norm": 0.024562133476138115, "learning_rate": 3.284115838363239e-05, "loss": 0.1412, "num_input_tokens_seen": 19582432, "step": 19455 }, { "epoch": 9.174917491749175, "grad_norm": 0.3413994610309601, "learning_rate": 3.2831390748734657e-05, "loss": 0.1362, "num_input_tokens_seen": 19587936, "step": 19460 }, { "epoch": 9.177274870344178, "grad_norm": 1.1490658521652222, "learning_rate": 3.2821621788115445e-05, "loss": 0.2161, "num_input_tokens_seen": 19592704, "step": 19465 }, { "epoch": 9.17963224893918, "grad_norm": 1.2901191711425781, "learning_rate": 3.281185150342846e-05, "loss": 0.2406, "num_input_tokens_seen": 19597440, "step": 19470 }, { "epoch": 9.181989627534183, "grad_norm": 1.8444077968597412, "learning_rate": 3.280207989632767e-05, "loss": 0.2278, "num_input_tokens_seen": 19603488, "step": 19475 }, { "epoch": 9.184347006129185, "grad_norm": 0.7344078421592712, "learning_rate": 3.2792306968467207e-05, "loss": 0.1422, "num_input_tokens_seen": 19609088, "step": 19480 }, { "epoch": 9.186704384724187, "grad_norm": 0.5279793739318848, "learning_rate": 3.278253272150149e-05, "loss": 0.0462, "num_input_tokens_seen": 19613856, "step": 19485 }, { "epoch": 9.18906176331919, "grad_norm": 1.237180471420288, "learning_rate": 3.277275715708512e-05, "loss": 0.102, "num_input_tokens_seen": 19619648, "step": 19490 }, { "epoch": 9.191419141914192, "grad_norm": 0.726967453956604, "learning_rate": 3.2762980276872936e-05, "loss": 0.0789, "num_input_tokens_seen": 19624320, "step": 19495 }, { "epoch": 9.193776520509195, "grad_norm": 0.9716647267341614, "learning_rate": 3.275320208252e-05, "loss": 0.0431, "num_input_tokens_seen": 19628832, "step": 19500 }, { "epoch": 9.196133899104197, "grad_norm": 0.07955606281757355, "learning_rate": 3.27434225756816e-05, "loss": 0.0907, "num_input_tokens_seen": 19634464, "step": 19505 }, { "epoch": 9.198491277699198, "grad_norm": 0.09777360409498215, "learning_rate": 3.273364175801322e-05, "loss": 0.0229, "num_input_tokens_seen": 19640384, "step": 19510 }, { "epoch": 9.2008486562942, "grad_norm": 0.18338656425476074, "learning_rate": 3.2723859631170614e-05, "loss": 0.041, "num_input_tokens_seen": 19644864, "step": 19515 }, { "epoch": 9.203206034889202, "grad_norm": 0.2108597755432129, "learning_rate": 3.2714076196809705e-05, "loss": 0.1109, "num_input_tokens_seen": 19650784, "step": 19520 }, { "epoch": 9.205563413484205, "grad_norm": 0.05548704415559769, "learning_rate": 3.2704291456586686e-05, "loss": 0.05, "num_input_tokens_seen": 19656032, "step": 19525 }, { "epoch": 9.207920792079207, "grad_norm": 0.048527203500270844, "learning_rate": 3.269450541215792e-05, "loss": 0.0723, "num_input_tokens_seen": 19661120, "step": 19530 }, { "epoch": 9.21027817067421, "grad_norm": 0.04767385497689247, "learning_rate": 3.2684718065180044e-05, "loss": 0.0669, "num_input_tokens_seen": 19665792, "step": 19535 }, { "epoch": 9.212635549269212, "grad_norm": 0.3825305104255676, "learning_rate": 3.267492941730988e-05, "loss": 0.0779, "num_input_tokens_seen": 19671712, "step": 19540 }, { "epoch": 9.214992927864214, "grad_norm": 0.2957548499107361, "learning_rate": 3.266513947020447e-05, "loss": 0.1058, "num_input_tokens_seen": 19677120, "step": 19545 }, { "epoch": 9.217350306459217, "grad_norm": 0.13122901320457458, "learning_rate": 3.2655348225521096e-05, "loss": 0.1505, "num_input_tokens_seen": 19682112, "step": 19550 }, { "epoch": 9.21970768505422, "grad_norm": 1.9333276748657227, "learning_rate": 3.2645555684917246e-05, "loss": 0.1224, "num_input_tokens_seen": 19688288, "step": 19555 }, { "epoch": 9.222065063649222, "grad_norm": 2.1417195796966553, "learning_rate": 3.263576185005063e-05, "loss": 0.1248, "num_input_tokens_seen": 19693216, "step": 19560 }, { "epoch": 9.224422442244224, "grad_norm": 1.380716323852539, "learning_rate": 3.262596672257919e-05, "loss": 0.0444, "num_input_tokens_seen": 19697504, "step": 19565 }, { "epoch": 9.226779820839226, "grad_norm": 0.3284957706928253, "learning_rate": 3.261617030416107e-05, "loss": 0.0973, "num_input_tokens_seen": 19702560, "step": 19570 }, { "epoch": 9.229137199434229, "grad_norm": 0.09702058136463165, "learning_rate": 3.260637259645462e-05, "loss": 0.0192, "num_input_tokens_seen": 19706304, "step": 19575 }, { "epoch": 9.231494578029231, "grad_norm": 2.017909526824951, "learning_rate": 3.2596573601118444e-05, "loss": 0.1414, "num_input_tokens_seen": 19713216, "step": 19580 }, { "epoch": 9.233851956624234, "grad_norm": 1.13421630859375, "learning_rate": 3.258677331981134e-05, "loss": 0.1357, "num_input_tokens_seen": 19718592, "step": 19585 }, { "epoch": 9.236209335219236, "grad_norm": 1.9667149782180786, "learning_rate": 3.257697175419233e-05, "loss": 0.3251, "num_input_tokens_seen": 19725472, "step": 19590 }, { "epoch": 9.238566713814238, "grad_norm": 0.28481319546699524, "learning_rate": 3.256716890592065e-05, "loss": 0.071, "num_input_tokens_seen": 19729056, "step": 19595 }, { "epoch": 9.24092409240924, "grad_norm": 1.2026910781860352, "learning_rate": 3.255736477665576e-05, "loss": 0.083, "num_input_tokens_seen": 19734816, "step": 19600 }, { "epoch": 9.243281471004243, "grad_norm": 1.239881157875061, "learning_rate": 3.2547559368057325e-05, "loss": 0.2715, "num_input_tokens_seen": 19739584, "step": 19605 }, { "epoch": 9.245638849599246, "grad_norm": 0.05452215299010277, "learning_rate": 3.253775268178524e-05, "loss": 0.0609, "num_input_tokens_seen": 19744416, "step": 19610 }, { "epoch": 9.247996228194248, "grad_norm": 1.0294424295425415, "learning_rate": 3.252794471949961e-05, "loss": 0.1477, "num_input_tokens_seen": 19749184, "step": 19615 }, { "epoch": 9.25035360678925, "grad_norm": 1.1053460836410522, "learning_rate": 3.251813548286076e-05, "loss": 0.1388, "num_input_tokens_seen": 19753888, "step": 19620 }, { "epoch": 9.252710985384253, "grad_norm": 2.102388381958008, "learning_rate": 3.250832497352922e-05, "loss": 0.2659, "num_input_tokens_seen": 19758368, "step": 19625 }, { "epoch": 9.255068363979255, "grad_norm": 1.8135311603546143, "learning_rate": 3.249851319316575e-05, "loss": 0.1459, "num_input_tokens_seen": 19762432, "step": 19630 }, { "epoch": 9.257425742574258, "grad_norm": 0.25374653935432434, "learning_rate": 3.2488700143431314e-05, "loss": 0.0538, "num_input_tokens_seen": 19766784, "step": 19635 }, { "epoch": 9.25978312116926, "grad_norm": 0.11293213069438934, "learning_rate": 3.247888582598709e-05, "loss": 0.0824, "num_input_tokens_seen": 19771264, "step": 19640 }, { "epoch": 9.262140499764262, "grad_norm": 1.6045514345169067, "learning_rate": 3.2469070242494476e-05, "loss": 0.2229, "num_input_tokens_seen": 19776320, "step": 19645 }, { "epoch": 9.264497878359265, "grad_norm": 2.2929646968841553, "learning_rate": 3.2459253394615094e-05, "loss": 0.2367, "num_input_tokens_seen": 19781696, "step": 19650 }, { "epoch": 9.266855256954267, "grad_norm": 0.09757190197706223, "learning_rate": 3.244943528401077e-05, "loss": 0.0838, "num_input_tokens_seen": 19786400, "step": 19655 }, { "epoch": 9.26921263554927, "grad_norm": 0.13987210392951965, "learning_rate": 3.2439615912343526e-05, "loss": 0.1309, "num_input_tokens_seen": 19791072, "step": 19660 }, { "epoch": 9.271570014144272, "grad_norm": 0.33886396884918213, "learning_rate": 3.242979528127562e-05, "loss": 0.0628, "num_input_tokens_seen": 19796032, "step": 19665 }, { "epoch": 9.273927392739274, "grad_norm": 0.12372129410505295, "learning_rate": 3.2419973392469535e-05, "loss": 0.0525, "num_input_tokens_seen": 19801024, "step": 19670 }, { "epoch": 9.276284771334277, "grad_norm": 2.448765277862549, "learning_rate": 3.241015024758794e-05, "loss": 0.1603, "num_input_tokens_seen": 19805696, "step": 19675 }, { "epoch": 9.27864214992928, "grad_norm": 0.5962498784065247, "learning_rate": 3.2400325848293716e-05, "loss": 0.0715, "num_input_tokens_seen": 19810144, "step": 19680 }, { "epoch": 9.280999528524282, "grad_norm": 0.654751181602478, "learning_rate": 3.2390500196249986e-05, "loss": 0.0319, "num_input_tokens_seen": 19814848, "step": 19685 }, { "epoch": 9.283356907119284, "grad_norm": 0.5868538022041321, "learning_rate": 3.2380673293120054e-05, "loss": 0.0584, "num_input_tokens_seen": 19820384, "step": 19690 }, { "epoch": 9.285714285714286, "grad_norm": 0.21720556914806366, "learning_rate": 3.2370845140567443e-05, "loss": 0.1157, "num_input_tokens_seen": 19824960, "step": 19695 }, { "epoch": 9.288071664309289, "grad_norm": 1.6061183214187622, "learning_rate": 3.2361015740255905e-05, "loss": 0.2238, "num_input_tokens_seen": 19830720, "step": 19700 }, { "epoch": 9.290429042904291, "grad_norm": 0.4360312223434448, "learning_rate": 3.235118509384939e-05, "loss": 0.125, "num_input_tokens_seen": 19836480, "step": 19705 }, { "epoch": 9.292786421499294, "grad_norm": 0.6296364665031433, "learning_rate": 3.2341353203012045e-05, "loss": 0.085, "num_input_tokens_seen": 19841184, "step": 19710 }, { "epoch": 9.295143800094294, "grad_norm": 0.8810283541679382, "learning_rate": 3.233152006940826e-05, "loss": 0.1747, "num_input_tokens_seen": 19845984, "step": 19715 }, { "epoch": 9.297501178689297, "grad_norm": 0.045846208930015564, "learning_rate": 3.23216856947026e-05, "loss": 0.2419, "num_input_tokens_seen": 19850304, "step": 19720 }, { "epoch": 9.299858557284299, "grad_norm": 3.2364509105682373, "learning_rate": 3.2311850080559875e-05, "loss": 0.2432, "num_input_tokens_seen": 19855552, "step": 19725 }, { "epoch": 9.302215935879302, "grad_norm": 0.19219952821731567, "learning_rate": 3.230201322864507e-05, "loss": 0.2157, "num_input_tokens_seen": 19862176, "step": 19730 }, { "epoch": 9.304573314474304, "grad_norm": 0.3698905408382416, "learning_rate": 3.229217514062342e-05, "loss": 0.0555, "num_input_tokens_seen": 19867008, "step": 19735 }, { "epoch": 9.306930693069306, "grad_norm": 0.8398856520652771, "learning_rate": 3.228233581816033e-05, "loss": 0.1923, "num_input_tokens_seen": 19872640, "step": 19740 }, { "epoch": 9.309288071664309, "grad_norm": 0.29845094680786133, "learning_rate": 3.227249526292142e-05, "loss": 0.2343, "num_input_tokens_seen": 19878656, "step": 19745 }, { "epoch": 9.311645450259311, "grad_norm": 0.03156471997499466, "learning_rate": 3.2262653476572554e-05, "loss": 0.093, "num_input_tokens_seen": 19882624, "step": 19750 }, { "epoch": 9.314002828854314, "grad_norm": 0.7438340783119202, "learning_rate": 3.225281046077976e-05, "loss": 0.2056, "num_input_tokens_seen": 19888096, "step": 19755 }, { "epoch": 9.316360207449316, "grad_norm": 0.33702796697616577, "learning_rate": 3.224296621720931e-05, "loss": 0.0617, "num_input_tokens_seen": 19892480, "step": 19760 }, { "epoch": 9.318717586044318, "grad_norm": 0.3701706528663635, "learning_rate": 3.223312074752765e-05, "loss": 0.1072, "num_input_tokens_seen": 19898272, "step": 19765 }, { "epoch": 9.32107496463932, "grad_norm": 0.38950520753860474, "learning_rate": 3.2223274053401466e-05, "loss": 0.1086, "num_input_tokens_seen": 19903168, "step": 19770 }, { "epoch": 9.323432343234323, "grad_norm": 0.7031295299530029, "learning_rate": 3.221342613649763e-05, "loss": 0.2417, "num_input_tokens_seen": 19908032, "step": 19775 }, { "epoch": 9.325789721829326, "grad_norm": 1.832255244255066, "learning_rate": 3.2203576998483216e-05, "loss": 0.1627, "num_input_tokens_seen": 19912160, "step": 19780 }, { "epoch": 9.328147100424328, "grad_norm": 1.4314740896224976, "learning_rate": 3.219372664102552e-05, "loss": 0.1085, "num_input_tokens_seen": 19916640, "step": 19785 }, { "epoch": 9.33050447901933, "grad_norm": 0.6315851807594299, "learning_rate": 3.2183875065792065e-05, "loss": 0.1413, "num_input_tokens_seen": 19921824, "step": 19790 }, { "epoch": 9.332861857614333, "grad_norm": 0.15135706961154938, "learning_rate": 3.2174022274450534e-05, "loss": 0.0806, "num_input_tokens_seen": 19927104, "step": 19795 }, { "epoch": 9.335219236209335, "grad_norm": 0.8087823390960693, "learning_rate": 3.216416826866883e-05, "loss": 0.1001, "num_input_tokens_seen": 19932800, "step": 19800 }, { "epoch": 9.337576614804338, "grad_norm": 0.20103959739208221, "learning_rate": 3.2154313050115084e-05, "loss": 0.1531, "num_input_tokens_seen": 19937696, "step": 19805 }, { "epoch": 9.33993399339934, "grad_norm": 0.04870222881436348, "learning_rate": 3.214445662045761e-05, "loss": 0.1684, "num_input_tokens_seen": 19941472, "step": 19810 }, { "epoch": 9.342291371994342, "grad_norm": 0.2328980565071106, "learning_rate": 3.2134598981364936e-05, "loss": 0.0946, "num_input_tokens_seen": 19946336, "step": 19815 }, { "epoch": 9.344648750589345, "grad_norm": 0.4648456275463104, "learning_rate": 3.21247401345058e-05, "loss": 0.1554, "num_input_tokens_seen": 19951584, "step": 19820 }, { "epoch": 9.347006129184347, "grad_norm": 0.763016939163208, "learning_rate": 3.2114880081549125e-05, "loss": 0.0778, "num_input_tokens_seen": 19956192, "step": 19825 }, { "epoch": 9.34936350777935, "grad_norm": 1.6945873498916626, "learning_rate": 3.2105018824164055e-05, "loss": 0.0936, "num_input_tokens_seen": 19961472, "step": 19830 }, { "epoch": 9.351720886374352, "grad_norm": 0.5634740591049194, "learning_rate": 3.209515636401993e-05, "loss": 0.2639, "num_input_tokens_seen": 19966016, "step": 19835 }, { "epoch": 9.354078264969354, "grad_norm": 1.5984715223312378, "learning_rate": 3.2085292702786313e-05, "loss": 0.1984, "num_input_tokens_seen": 19970560, "step": 19840 }, { "epoch": 9.356435643564357, "grad_norm": 1.7816808223724365, "learning_rate": 3.207542784213293e-05, "loss": 0.1441, "num_input_tokens_seen": 19974592, "step": 19845 }, { "epoch": 9.35879302215936, "grad_norm": 0.21326844394207, "learning_rate": 3.206556178372976e-05, "loss": 0.193, "num_input_tokens_seen": 19979712, "step": 19850 }, { "epoch": 9.361150400754362, "grad_norm": 0.6102802753448486, "learning_rate": 3.205569452924693e-05, "loss": 0.2008, "num_input_tokens_seen": 19984416, "step": 19855 }, { "epoch": 9.363507779349364, "grad_norm": 0.7349022030830383, "learning_rate": 3.204582608035483e-05, "loss": 0.1592, "num_input_tokens_seen": 19988576, "step": 19860 }, { "epoch": 9.365865157944366, "grad_norm": 0.2829272150993347, "learning_rate": 3.2035956438724004e-05, "loss": 0.0446, "num_input_tokens_seen": 19993696, "step": 19865 }, { "epoch": 9.368222536539369, "grad_norm": 0.2966292202472687, "learning_rate": 3.202608560602521e-05, "loss": 0.0288, "num_input_tokens_seen": 19999872, "step": 19870 }, { "epoch": 9.370579915134371, "grad_norm": 2.251495838165283, "learning_rate": 3.201621358392942e-05, "loss": 0.1252, "num_input_tokens_seen": 20004896, "step": 19875 }, { "epoch": 9.372937293729374, "grad_norm": 0.8625372052192688, "learning_rate": 3.200634037410781e-05, "loss": 0.0631, "num_input_tokens_seen": 20009088, "step": 19880 }, { "epoch": 9.375294672324376, "grad_norm": 2.186121702194214, "learning_rate": 3.1996465978231726e-05, "loss": 0.2882, "num_input_tokens_seen": 20013856, "step": 19885 }, { "epoch": 9.377652050919378, "grad_norm": 0.2265891581773758, "learning_rate": 3.198659039797274e-05, "loss": 0.0885, "num_input_tokens_seen": 20018400, "step": 19890 }, { "epoch": 9.38000942951438, "grad_norm": 0.7243408560752869, "learning_rate": 3.1976713635002634e-05, "loss": 0.1097, "num_input_tokens_seen": 20023264, "step": 19895 }, { "epoch": 9.382366808109383, "grad_norm": 0.31719857454299927, "learning_rate": 3.196683569099337e-05, "loss": 0.0321, "num_input_tokens_seen": 20029696, "step": 19900 }, { "epoch": 9.384724186704386, "grad_norm": 1.723666787147522, "learning_rate": 3.1956956567617106e-05, "loss": 0.1352, "num_input_tokens_seen": 20033952, "step": 19905 }, { "epoch": 9.387081565299386, "grad_norm": 0.23423658311367035, "learning_rate": 3.194707626654622e-05, "loss": 0.1029, "num_input_tokens_seen": 20039104, "step": 19910 }, { "epoch": 9.389438943894389, "grad_norm": 0.2584892809391022, "learning_rate": 3.1937194789453274e-05, "loss": 0.132, "num_input_tokens_seen": 20043296, "step": 19915 }, { "epoch": 9.391796322489391, "grad_norm": 2.205338954925537, "learning_rate": 3.1927312138011043e-05, "loss": 0.1738, "num_input_tokens_seen": 20048128, "step": 19920 }, { "epoch": 9.394153701084393, "grad_norm": 0.07707193493843079, "learning_rate": 3.1917428313892484e-05, "loss": 0.1569, "num_input_tokens_seen": 20053664, "step": 19925 }, { "epoch": 9.396511079679396, "grad_norm": 1.6547833681106567, "learning_rate": 3.190754331877076e-05, "loss": 0.1544, "num_input_tokens_seen": 20060064, "step": 19930 }, { "epoch": 9.398868458274398, "grad_norm": 0.5165858268737793, "learning_rate": 3.189765715431924e-05, "loss": 0.0682, "num_input_tokens_seen": 20064800, "step": 19935 }, { "epoch": 9.4012258368694, "grad_norm": 0.2879968285560608, "learning_rate": 3.188776982221147e-05, "loss": 0.1742, "num_input_tokens_seen": 20070496, "step": 19940 }, { "epoch": 9.403583215464403, "grad_norm": 0.050670724362134933, "learning_rate": 3.1877881324121226e-05, "loss": 0.059, "num_input_tokens_seen": 20075840, "step": 19945 }, { "epoch": 9.405940594059405, "grad_norm": 0.216670960187912, "learning_rate": 3.186799166172245e-05, "loss": 0.1739, "num_input_tokens_seen": 20082816, "step": 19950 }, { "epoch": 9.408297972654408, "grad_norm": 0.7493588924407959, "learning_rate": 3.18581008366893e-05, "loss": 0.1067, "num_input_tokens_seen": 20087712, "step": 19955 }, { "epoch": 9.41065535124941, "grad_norm": 2.0678153038024902, "learning_rate": 3.184820885069612e-05, "loss": 0.1164, "num_input_tokens_seen": 20092256, "step": 19960 }, { "epoch": 9.413012729844413, "grad_norm": 0.3342495560646057, "learning_rate": 3.183831570541746e-05, "loss": 0.0614, "num_input_tokens_seen": 20097408, "step": 19965 }, { "epoch": 9.415370108439415, "grad_norm": 0.49231526255607605, "learning_rate": 3.182842140252806e-05, "loss": 0.2195, "num_input_tokens_seen": 20102048, "step": 19970 }, { "epoch": 9.417727487034417, "grad_norm": 1.1810877323150635, "learning_rate": 3.181852594370285e-05, "loss": 0.0828, "num_input_tokens_seen": 20107200, "step": 19975 }, { "epoch": 9.42008486562942, "grad_norm": 1.8288791179656982, "learning_rate": 3.1808629330616975e-05, "loss": 0.1436, "num_input_tokens_seen": 20111392, "step": 19980 }, { "epoch": 9.422442244224422, "grad_norm": 1.2210102081298828, "learning_rate": 3.1798731564945764e-05, "loss": 0.1003, "num_input_tokens_seen": 20116000, "step": 19985 }, { "epoch": 9.424799622819425, "grad_norm": 0.05216154828667641, "learning_rate": 3.178883264836472e-05, "loss": 0.0256, "num_input_tokens_seen": 20122432, "step": 19990 }, { "epoch": 9.427157001414427, "grad_norm": 0.09998687356710434, "learning_rate": 3.177893258254958e-05, "loss": 0.1223, "num_input_tokens_seen": 20126432, "step": 19995 }, { "epoch": 9.42951438000943, "grad_norm": 1.2201008796691895, "learning_rate": 3.176903136917626e-05, "loss": 0.2625, "num_input_tokens_seen": 20131648, "step": 20000 }, { "epoch": 9.431871758604432, "grad_norm": 1.525105357170105, "learning_rate": 3.175912900992086e-05, "loss": 0.208, "num_input_tokens_seen": 20135968, "step": 20005 }, { "epoch": 9.434229137199434, "grad_norm": 1.298371434211731, "learning_rate": 3.174922550645968e-05, "loss": 0.2599, "num_input_tokens_seen": 20140896, "step": 20010 }, { "epoch": 9.436586515794437, "grad_norm": 0.6887205243110657, "learning_rate": 3.173932086046922e-05, "loss": 0.0745, "num_input_tokens_seen": 20145952, "step": 20015 }, { "epoch": 9.438943894389439, "grad_norm": 1.5281747579574585, "learning_rate": 3.172941507362616e-05, "loss": 0.0934, "num_input_tokens_seen": 20151712, "step": 20020 }, { "epoch": 9.441301272984441, "grad_norm": 0.3669913113117218, "learning_rate": 3.171950814760739e-05, "loss": 0.0553, "num_input_tokens_seen": 20156320, "step": 20025 }, { "epoch": 9.443658651579444, "grad_norm": 0.9058700799942017, "learning_rate": 3.170960008408999e-05, "loss": 0.155, "num_input_tokens_seen": 20161408, "step": 20030 }, { "epoch": 9.446016030174446, "grad_norm": 1.6726016998291016, "learning_rate": 3.1699690884751206e-05, "loss": 0.1055, "num_input_tokens_seen": 20166976, "step": 20035 }, { "epoch": 9.448373408769449, "grad_norm": 0.20595762133598328, "learning_rate": 3.168978055126851e-05, "loss": 0.0226, "num_input_tokens_seen": 20171328, "step": 20040 }, { "epoch": 9.450730787364451, "grad_norm": 0.27115535736083984, "learning_rate": 3.167986908531956e-05, "loss": 0.0712, "num_input_tokens_seen": 20176256, "step": 20045 }, { "epoch": 9.453088165959453, "grad_norm": 0.8793925046920776, "learning_rate": 3.166995648858218e-05, "loss": 0.062, "num_input_tokens_seen": 20182080, "step": 20050 }, { "epoch": 9.455445544554456, "grad_norm": 0.8420673608779907, "learning_rate": 3.166004276273443e-05, "loss": 0.0425, "num_input_tokens_seen": 20187168, "step": 20055 }, { "epoch": 9.457802923149458, "grad_norm": 0.790733814239502, "learning_rate": 3.1650127909454516e-05, "loss": 0.066, "num_input_tokens_seen": 20192480, "step": 20060 }, { "epoch": 9.46016030174446, "grad_norm": 1.4573312997817993, "learning_rate": 3.164021193042085e-05, "loss": 0.1607, "num_input_tokens_seen": 20197920, "step": 20065 }, { "epoch": 9.462517680339463, "grad_norm": 0.5648476481437683, "learning_rate": 3.1630294827312067e-05, "loss": 0.1525, "num_input_tokens_seen": 20203008, "step": 20070 }, { "epoch": 9.464875058934465, "grad_norm": 0.04753285273909569, "learning_rate": 3.1620376601806935e-05, "loss": 0.0537, "num_input_tokens_seen": 20207680, "step": 20075 }, { "epoch": 9.467232437529468, "grad_norm": 1.5329252481460571, "learning_rate": 3.161045725558445e-05, "loss": 0.1362, "num_input_tokens_seen": 20212544, "step": 20080 }, { "epoch": 9.46958981612447, "grad_norm": 1.4126529693603516, "learning_rate": 3.160053679032379e-05, "loss": 0.1659, "num_input_tokens_seen": 20216608, "step": 20085 }, { "epoch": 9.471947194719473, "grad_norm": 2.116741895675659, "learning_rate": 3.1590615207704334e-05, "loss": 0.2875, "num_input_tokens_seen": 20220768, "step": 20090 }, { "epoch": 9.474304573314475, "grad_norm": 0.15318188071250916, "learning_rate": 3.158069250940561e-05, "loss": 0.1401, "num_input_tokens_seen": 20225728, "step": 20095 }, { "epoch": 9.476661951909477, "grad_norm": 1.5900253057479858, "learning_rate": 3.157076869710738e-05, "loss": 0.1374, "num_input_tokens_seen": 20230304, "step": 20100 }, { "epoch": 9.47901933050448, "grad_norm": 0.12099891155958176, "learning_rate": 3.1560843772489584e-05, "loss": 0.0361, "num_input_tokens_seen": 20235360, "step": 20105 }, { "epoch": 9.481376709099482, "grad_norm": 0.9373458623886108, "learning_rate": 3.1550917737232334e-05, "loss": 0.0457, "num_input_tokens_seen": 20240608, "step": 20110 }, { "epoch": 9.483734087694483, "grad_norm": 0.6653017997741699, "learning_rate": 3.154099059301593e-05, "loss": 0.2025, "num_input_tokens_seen": 20246112, "step": 20115 }, { "epoch": 9.486091466289485, "grad_norm": 0.15242090821266174, "learning_rate": 3.153106234152089e-05, "loss": 0.133, "num_input_tokens_seen": 20251104, "step": 20120 }, { "epoch": 9.488448844884488, "grad_norm": 0.30492672324180603, "learning_rate": 3.152113298442788e-05, "loss": 0.0475, "num_input_tokens_seen": 20256288, "step": 20125 }, { "epoch": 9.49080622347949, "grad_norm": 0.0215653907507658, "learning_rate": 3.151120252341778e-05, "loss": 0.0613, "num_input_tokens_seen": 20262048, "step": 20130 }, { "epoch": 9.493163602074493, "grad_norm": 0.5672243237495422, "learning_rate": 3.150127096017164e-05, "loss": 0.1553, "num_input_tokens_seen": 20266912, "step": 20135 }, { "epoch": 9.495520980669495, "grad_norm": 0.7161355018615723, "learning_rate": 3.149133829637072e-05, "loss": 0.1855, "num_input_tokens_seen": 20272800, "step": 20140 }, { "epoch": 9.497878359264497, "grad_norm": 0.15850235521793365, "learning_rate": 3.1481404533696436e-05, "loss": 0.0199, "num_input_tokens_seen": 20277440, "step": 20145 }, { "epoch": 9.5002357378595, "grad_norm": 0.329341858625412, "learning_rate": 3.1471469673830424e-05, "loss": 0.0812, "num_input_tokens_seen": 20282624, "step": 20150 }, { "epoch": 9.502593116454502, "grad_norm": 1.2305452823638916, "learning_rate": 3.146153371845447e-05, "loss": 0.1565, "num_input_tokens_seen": 20286848, "step": 20155 }, { "epoch": 9.504950495049505, "grad_norm": 0.06671243906021118, "learning_rate": 3.145159666925057e-05, "loss": 0.0808, "num_input_tokens_seen": 20291104, "step": 20160 }, { "epoch": 9.507307873644507, "grad_norm": 0.11983399093151093, "learning_rate": 3.144165852790088e-05, "loss": 0.0992, "num_input_tokens_seen": 20295392, "step": 20165 }, { "epoch": 9.50966525223951, "grad_norm": 0.09091667830944061, "learning_rate": 3.143171929608779e-05, "loss": 0.0896, "num_input_tokens_seen": 20300192, "step": 20170 }, { "epoch": 9.512022630834512, "grad_norm": 0.10469618439674377, "learning_rate": 3.1421778975493826e-05, "loss": 0.1648, "num_input_tokens_seen": 20304672, "step": 20175 }, { "epoch": 9.514380009429514, "grad_norm": 1.2931286096572876, "learning_rate": 3.141183756780172e-05, "loss": 0.0854, "num_input_tokens_seen": 20309408, "step": 20180 }, { "epoch": 9.516737388024517, "grad_norm": 0.5678479671478271, "learning_rate": 3.140189507469437e-05, "loss": 0.155, "num_input_tokens_seen": 20316320, "step": 20185 }, { "epoch": 9.519094766619519, "grad_norm": 0.440439373254776, "learning_rate": 3.139195149785489e-05, "loss": 0.1578, "num_input_tokens_seen": 20321248, "step": 20190 }, { "epoch": 9.521452145214521, "grad_norm": 3.3244550228118896, "learning_rate": 3.1382006838966546e-05, "loss": 0.0879, "num_input_tokens_seen": 20326944, "step": 20195 }, { "epoch": 9.523809523809524, "grad_norm": 1.671388864517212, "learning_rate": 3.1372061099712804e-05, "loss": 0.0948, "num_input_tokens_seen": 20332096, "step": 20200 }, { "epoch": 9.526166902404526, "grad_norm": 1.5449880361557007, "learning_rate": 3.136211428177731e-05, "loss": 0.2, "num_input_tokens_seen": 20337216, "step": 20205 }, { "epoch": 9.528524280999529, "grad_norm": 2.371427059173584, "learning_rate": 3.135216638684389e-05, "loss": 0.231, "num_input_tokens_seen": 20342336, "step": 20210 }, { "epoch": 9.530881659594531, "grad_norm": 0.22271542251110077, "learning_rate": 3.1342217416596566e-05, "loss": 0.2855, "num_input_tokens_seen": 20347616, "step": 20215 }, { "epoch": 9.533239038189533, "grad_norm": 0.7494765520095825, "learning_rate": 3.13322673727195e-05, "loss": 0.0508, "num_input_tokens_seen": 20352096, "step": 20220 }, { "epoch": 9.535596416784536, "grad_norm": 0.20126372575759888, "learning_rate": 3.13223162568971e-05, "loss": 0.055, "num_input_tokens_seen": 20357376, "step": 20225 }, { "epoch": 9.537953795379538, "grad_norm": 0.3383217453956604, "learning_rate": 3.1312364070813896e-05, "loss": 0.0507, "num_input_tokens_seen": 20361824, "step": 20230 }, { "epoch": 9.54031117397454, "grad_norm": 0.5627388954162598, "learning_rate": 3.130241081615464e-05, "loss": 0.0614, "num_input_tokens_seen": 20366176, "step": 20235 }, { "epoch": 9.542668552569543, "grad_norm": 0.40680575370788574, "learning_rate": 3.129245649460423e-05, "loss": 0.0514, "num_input_tokens_seen": 20372352, "step": 20240 }, { "epoch": 9.545025931164545, "grad_norm": 0.07877038419246674, "learning_rate": 3.128250110784778e-05, "loss": 0.076, "num_input_tokens_seen": 20377216, "step": 20245 }, { "epoch": 9.547383309759548, "grad_norm": 0.12657098472118378, "learning_rate": 3.127254465757056e-05, "loss": 0.0382, "num_input_tokens_seen": 20382368, "step": 20250 }, { "epoch": 9.54974068835455, "grad_norm": 0.032778672873973846, "learning_rate": 3.126258714545804e-05, "loss": 0.098, "num_input_tokens_seen": 20387072, "step": 20255 }, { "epoch": 9.552098066949553, "grad_norm": 0.5960850715637207, "learning_rate": 3.1252628573195845e-05, "loss": 0.1154, "num_input_tokens_seen": 20392256, "step": 20260 }, { "epoch": 9.554455445544555, "grad_norm": 0.31571081280708313, "learning_rate": 3.124266894246979e-05, "loss": 0.0238, "num_input_tokens_seen": 20396960, "step": 20265 }, { "epoch": 9.556812824139557, "grad_norm": 0.4664170444011688, "learning_rate": 3.123270825496589e-05, "loss": 0.0883, "num_input_tokens_seen": 20401472, "step": 20270 }, { "epoch": 9.55917020273456, "grad_norm": 0.45604291558265686, "learning_rate": 3.12227465123703e-05, "loss": 0.092, "num_input_tokens_seen": 20406144, "step": 20275 }, { "epoch": 9.561527581329562, "grad_norm": 1.5464173555374146, "learning_rate": 3.121278371636938e-05, "loss": 0.1539, "num_input_tokens_seen": 20410816, "step": 20280 }, { "epoch": 9.563884959924565, "grad_norm": 0.8406140804290771, "learning_rate": 3.1202819868649666e-05, "loss": 0.137, "num_input_tokens_seen": 20415712, "step": 20285 }, { "epoch": 9.566242338519567, "grad_norm": 0.790010392665863, "learning_rate": 3.119285497089787e-05, "loss": 0.0429, "num_input_tokens_seen": 20420704, "step": 20290 }, { "epoch": 9.56859971711457, "grad_norm": 1.691705346107483, "learning_rate": 3.118288902480087e-05, "loss": 0.1227, "num_input_tokens_seen": 20425504, "step": 20295 }, { "epoch": 9.570957095709572, "grad_norm": 0.13527269661426544, "learning_rate": 3.117292203204574e-05, "loss": 0.105, "num_input_tokens_seen": 20430656, "step": 20300 }, { "epoch": 9.573314474304574, "grad_norm": 0.42389851808547974, "learning_rate": 3.1162953994319716e-05, "loss": 0.0181, "num_input_tokens_seen": 20435680, "step": 20305 }, { "epoch": 9.575671852899575, "grad_norm": 0.24368393421173096, "learning_rate": 3.115298491331022e-05, "loss": 0.1292, "num_input_tokens_seen": 20440512, "step": 20310 }, { "epoch": 9.578029231494579, "grad_norm": 0.07837347686290741, "learning_rate": 3.114301479070486e-05, "loss": 0.0512, "num_input_tokens_seen": 20445472, "step": 20315 }, { "epoch": 9.58038661008958, "grad_norm": 0.9258078932762146, "learning_rate": 3.11330436281914e-05, "loss": 0.0505, "num_input_tokens_seen": 20451232, "step": 20320 }, { "epoch": 9.582743988684582, "grad_norm": 1.3479984998703003, "learning_rate": 3.112307142745778e-05, "loss": 0.2404, "num_input_tokens_seen": 20456032, "step": 20325 }, { "epoch": 9.585101367279584, "grad_norm": 0.4083902835845947, "learning_rate": 3.111309819019213e-05, "loss": 0.0882, "num_input_tokens_seen": 20461056, "step": 20330 }, { "epoch": 9.587458745874587, "grad_norm": 1.5414111614227295, "learning_rate": 3.110312391808275e-05, "loss": 0.164, "num_input_tokens_seen": 20466144, "step": 20335 }, { "epoch": 9.58981612446959, "grad_norm": 0.2815419137477875, "learning_rate": 3.1093148612818126e-05, "loss": 0.1881, "num_input_tokens_seen": 20472192, "step": 20340 }, { "epoch": 9.592173503064592, "grad_norm": 1.5159715414047241, "learning_rate": 3.108317227608688e-05, "loss": 0.2007, "num_input_tokens_seen": 20476992, "step": 20345 }, { "epoch": 9.594530881659594, "grad_norm": 0.22223816812038422, "learning_rate": 3.107319490957787e-05, "loss": 0.0565, "num_input_tokens_seen": 20482848, "step": 20350 }, { "epoch": 9.596888260254596, "grad_norm": 0.8351522088050842, "learning_rate": 3.106321651498007e-05, "loss": 0.1515, "num_input_tokens_seen": 20488000, "step": 20355 }, { "epoch": 9.599245638849599, "grad_norm": 1.121368646621704, "learning_rate": 3.105323709398267e-05, "loss": 0.2603, "num_input_tokens_seen": 20492864, "step": 20360 }, { "epoch": 9.601603017444601, "grad_norm": 0.22383131086826324, "learning_rate": 3.1043256648275e-05, "loss": 0.0667, "num_input_tokens_seen": 20497216, "step": 20365 }, { "epoch": 9.603960396039604, "grad_norm": 1.274118423461914, "learning_rate": 3.103327517954659e-05, "loss": 0.1184, "num_input_tokens_seen": 20502560, "step": 20370 }, { "epoch": 9.606317774634606, "grad_norm": 0.08497676998376846, "learning_rate": 3.102329268948714e-05, "loss": 0.0719, "num_input_tokens_seen": 20508000, "step": 20375 }, { "epoch": 9.608675153229608, "grad_norm": 0.6120868921279907, "learning_rate": 3.10133091797865e-05, "loss": 0.1512, "num_input_tokens_seen": 20514432, "step": 20380 }, { "epoch": 9.61103253182461, "grad_norm": 1.0081313848495483, "learning_rate": 3.100332465213471e-05, "loss": 0.1079, "num_input_tokens_seen": 20519872, "step": 20385 }, { "epoch": 9.613389910419613, "grad_norm": 0.938000500202179, "learning_rate": 3.0993339108222e-05, "loss": 0.0744, "num_input_tokens_seen": 20523872, "step": 20390 }, { "epoch": 9.615747289014616, "grad_norm": 0.014699839986860752, "learning_rate": 3.098335254973872e-05, "loss": 0.1662, "num_input_tokens_seen": 20528320, "step": 20395 }, { "epoch": 9.618104667609618, "grad_norm": 1.3510109186172485, "learning_rate": 3.097336497837547e-05, "loss": 0.1484, "num_input_tokens_seen": 20534016, "step": 20400 }, { "epoch": 9.62046204620462, "grad_norm": 0.13081185519695282, "learning_rate": 3.096337639582294e-05, "loss": 0.0165, "num_input_tokens_seen": 20539008, "step": 20405 }, { "epoch": 9.622819424799623, "grad_norm": 1.6850059032440186, "learning_rate": 3.0953386803772036e-05, "loss": 0.3449, "num_input_tokens_seen": 20544416, "step": 20410 }, { "epoch": 9.625176803394625, "grad_norm": 0.2708599865436554, "learning_rate": 3.0943396203913826e-05, "loss": 0.0798, "num_input_tokens_seen": 20549664, "step": 20415 }, { "epoch": 9.627534181989628, "grad_norm": 0.13616864383220673, "learning_rate": 3.093340459793956e-05, "loss": 0.0242, "num_input_tokens_seen": 20554848, "step": 20420 }, { "epoch": 9.62989156058463, "grad_norm": 0.18078011274337769, "learning_rate": 3.0923411987540636e-05, "loss": 0.0258, "num_input_tokens_seen": 20559840, "step": 20425 }, { "epoch": 9.632248939179632, "grad_norm": 3.4493768215179443, "learning_rate": 3.091341837440863e-05, "loss": 0.1581, "num_input_tokens_seen": 20563840, "step": 20430 }, { "epoch": 9.634606317774635, "grad_norm": 0.06397560238838196, "learning_rate": 3.090342376023531e-05, "loss": 0.0375, "num_input_tokens_seen": 20568448, "step": 20435 }, { "epoch": 9.636963696369637, "grad_norm": 1.9638285636901855, "learning_rate": 3.089342814671257e-05, "loss": 0.1926, "num_input_tokens_seen": 20573824, "step": 20440 }, { "epoch": 9.63932107496464, "grad_norm": 0.7815367579460144, "learning_rate": 3.0883431535532516e-05, "loss": 0.0522, "num_input_tokens_seen": 20577984, "step": 20445 }, { "epoch": 9.641678453559642, "grad_norm": 1.6655110120773315, "learning_rate": 3.08734339283874e-05, "loss": 0.2802, "num_input_tokens_seen": 20583520, "step": 20450 }, { "epoch": 9.644035832154644, "grad_norm": 0.31216880679130554, "learning_rate": 3.0863435326969635e-05, "loss": 0.2941, "num_input_tokens_seen": 20587840, "step": 20455 }, { "epoch": 9.646393210749647, "grad_norm": 0.23618142306804657, "learning_rate": 3.085343573297184e-05, "loss": 0.0828, "num_input_tokens_seen": 20592480, "step": 20460 }, { "epoch": 9.64875058934465, "grad_norm": 0.336776465177536, "learning_rate": 3.084343514808674e-05, "loss": 0.3211, "num_input_tokens_seen": 20596096, "step": 20465 }, { "epoch": 9.651107967939652, "grad_norm": 0.5388619303703308, "learning_rate": 3.08334335740073e-05, "loss": 0.1785, "num_input_tokens_seen": 20600928, "step": 20470 }, { "epoch": 9.653465346534654, "grad_norm": 0.01358115766197443, "learning_rate": 3.0823431012426595e-05, "loss": 0.1228, "num_input_tokens_seen": 20605024, "step": 20475 }, { "epoch": 9.655822725129656, "grad_norm": 0.6576743721961975, "learning_rate": 3.0813427465037906e-05, "loss": 0.1692, "num_input_tokens_seen": 20610560, "step": 20480 }, { "epoch": 9.658180103724659, "grad_norm": 0.09778048098087311, "learning_rate": 3.080342293353465e-05, "loss": 0.1178, "num_input_tokens_seen": 20616608, "step": 20485 }, { "epoch": 9.660537482319661, "grad_norm": 0.3134918808937073, "learning_rate": 3.079341741961043e-05, "loss": 0.0766, "num_input_tokens_seen": 20622240, "step": 20490 }, { "epoch": 9.662894860914664, "grad_norm": 0.4877029359340668, "learning_rate": 3.0783410924959e-05, "loss": 0.1106, "num_input_tokens_seen": 20627584, "step": 20495 }, { "epoch": 9.665252239509666, "grad_norm": 0.35166388750076294, "learning_rate": 3.07734034512743e-05, "loss": 0.0729, "num_input_tokens_seen": 20632480, "step": 20500 }, { "epoch": 9.667609618104667, "grad_norm": 0.06532789766788483, "learning_rate": 3.076339500025042e-05, "loss": 0.1441, "num_input_tokens_seen": 20637408, "step": 20505 }, { "epoch": 9.66996699669967, "grad_norm": 1.7853195667266846, "learning_rate": 3.075338557358163e-05, "loss": 0.2678, "num_input_tokens_seen": 20643456, "step": 20510 }, { "epoch": 9.672324375294671, "grad_norm": 1.7148405313491821, "learning_rate": 3.074337517296234e-05, "loss": 0.0894, "num_input_tokens_seen": 20648224, "step": 20515 }, { "epoch": 9.674681753889674, "grad_norm": 0.38708510994911194, "learning_rate": 3.0733363800087154e-05, "loss": 0.2162, "num_input_tokens_seen": 20653824, "step": 20520 }, { "epoch": 9.677039132484676, "grad_norm": 1.2128242254257202, "learning_rate": 3.072335145665083e-05, "loss": 0.1529, "num_input_tokens_seen": 20658048, "step": 20525 }, { "epoch": 9.679396511079679, "grad_norm": 1.2779614925384521, "learning_rate": 3.071333814434827e-05, "loss": 0.2537, "num_input_tokens_seen": 20663744, "step": 20530 }, { "epoch": 9.681753889674681, "grad_norm": 0.20045581459999084, "learning_rate": 3.0703323864874576e-05, "loss": 0.0557, "num_input_tokens_seen": 20668992, "step": 20535 }, { "epoch": 9.684111268269683, "grad_norm": 0.04649519547820091, "learning_rate": 3.069330861992499e-05, "loss": 0.1298, "num_input_tokens_seen": 20673632, "step": 20540 }, { "epoch": 9.686468646864686, "grad_norm": 0.11670902371406555, "learning_rate": 3.0683292411194925e-05, "loss": 0.0817, "num_input_tokens_seen": 20678912, "step": 20545 }, { "epoch": 9.688826025459688, "grad_norm": 0.0978703573346138, "learning_rate": 3.067327524037994e-05, "loss": 0.1251, "num_input_tokens_seen": 20683680, "step": 20550 }, { "epoch": 9.69118340405469, "grad_norm": 0.04427862539887428, "learning_rate": 3.066325710917579e-05, "loss": 0.0521, "num_input_tokens_seen": 20688128, "step": 20555 }, { "epoch": 9.693540782649693, "grad_norm": 0.12281288951635361, "learning_rate": 3.0653238019278365e-05, "loss": 0.1589, "num_input_tokens_seen": 20693856, "step": 20560 }, { "epoch": 9.695898161244696, "grad_norm": 0.5853700637817383, "learning_rate": 3.0643217972383745e-05, "loss": 0.1956, "num_input_tokens_seen": 20698688, "step": 20565 }, { "epoch": 9.698255539839698, "grad_norm": 0.029744725674390793, "learning_rate": 3.063319697018812e-05, "loss": 0.0518, "num_input_tokens_seen": 20703840, "step": 20570 }, { "epoch": 9.7006129184347, "grad_norm": 0.9098172187805176, "learning_rate": 3.06231750143879e-05, "loss": 0.1183, "num_input_tokens_seen": 20708576, "step": 20575 }, { "epoch": 9.702970297029703, "grad_norm": 0.04596869274973869, "learning_rate": 3.061315210667963e-05, "loss": 0.0782, "num_input_tokens_seen": 20713120, "step": 20580 }, { "epoch": 9.705327675624705, "grad_norm": 0.8069584965705872, "learning_rate": 3.060312824876001e-05, "loss": 0.1282, "num_input_tokens_seen": 20717984, "step": 20585 }, { "epoch": 9.707685054219708, "grad_norm": 0.7313074469566345, "learning_rate": 3.0593103442325916e-05, "loss": 0.0607, "num_input_tokens_seen": 20723648, "step": 20590 }, { "epoch": 9.71004243281471, "grad_norm": 0.7662326097488403, "learning_rate": 3.0583077689074366e-05, "loss": 0.2808, "num_input_tokens_seen": 20728000, "step": 20595 }, { "epoch": 9.712399811409712, "grad_norm": 0.6954389810562134, "learning_rate": 3.057305099070257e-05, "loss": 0.1403, "num_input_tokens_seen": 20733152, "step": 20600 }, { "epoch": 9.714757190004715, "grad_norm": 0.5122353434562683, "learning_rate": 3.056302334890786e-05, "loss": 0.1342, "num_input_tokens_seen": 20738944, "step": 20605 }, { "epoch": 9.717114568599717, "grad_norm": 0.821670651435852, "learning_rate": 3.055299476538776e-05, "loss": 0.2181, "num_input_tokens_seen": 20744192, "step": 20610 }, { "epoch": 9.71947194719472, "grad_norm": 1.4169261455535889, "learning_rate": 3.054296524183992e-05, "loss": 0.0847, "num_input_tokens_seen": 20748864, "step": 20615 }, { "epoch": 9.721829325789722, "grad_norm": 1.0597997903823853, "learning_rate": 3.053293477996219e-05, "loss": 0.2352, "num_input_tokens_seen": 20754144, "step": 20620 }, { "epoch": 9.724186704384724, "grad_norm": 0.4183385372161865, "learning_rate": 3.052290338145253e-05, "loss": 0.0879, "num_input_tokens_seen": 20758592, "step": 20625 }, { "epoch": 9.726544082979727, "grad_norm": 0.6819756031036377, "learning_rate": 3.0512871048009117e-05, "loss": 0.1359, "num_input_tokens_seen": 20763520, "step": 20630 }, { "epoch": 9.72890146157473, "grad_norm": 0.15582279860973358, "learning_rate": 3.0502837781330228e-05, "loss": 0.2109, "num_input_tokens_seen": 20768480, "step": 20635 }, { "epoch": 9.731258840169732, "grad_norm": 0.6261126399040222, "learning_rate": 3.049280358311434e-05, "loss": 0.0872, "num_input_tokens_seen": 20773760, "step": 20640 }, { "epoch": 9.733616218764734, "grad_norm": 0.053167615085840225, "learning_rate": 3.0482768455060063e-05, "loss": 0.2955, "num_input_tokens_seen": 20777696, "step": 20645 }, { "epoch": 9.735973597359736, "grad_norm": 0.09153018146753311, "learning_rate": 3.047273239886619e-05, "loss": 0.0886, "num_input_tokens_seen": 20781728, "step": 20650 }, { "epoch": 9.738330975954739, "grad_norm": 0.23712705075740814, "learning_rate": 3.046269541623163e-05, "loss": 0.2982, "num_input_tokens_seen": 20787200, "step": 20655 }, { "epoch": 9.740688354549741, "grad_norm": 0.48862746357917786, "learning_rate": 3.0452657508855493e-05, "loss": 0.2494, "num_input_tokens_seen": 20791296, "step": 20660 }, { "epoch": 9.743045733144744, "grad_norm": 0.6083592176437378, "learning_rate": 3.0442618678437017e-05, "loss": 0.1077, "num_input_tokens_seen": 20795456, "step": 20665 }, { "epoch": 9.745403111739746, "grad_norm": 2.192009687423706, "learning_rate": 3.043257892667562e-05, "loss": 0.2269, "num_input_tokens_seen": 20799648, "step": 20670 }, { "epoch": 9.747760490334748, "grad_norm": 0.3385472893714905, "learning_rate": 3.0422538255270844e-05, "loss": 0.0412, "num_input_tokens_seen": 20804480, "step": 20675 }, { "epoch": 9.75011786892975, "grad_norm": 1.2937151193618774, "learning_rate": 3.041249666592241e-05, "loss": 0.136, "num_input_tokens_seen": 20810016, "step": 20680 }, { "epoch": 9.752475247524753, "grad_norm": 0.4269333779811859, "learning_rate": 3.0402454160330184e-05, "loss": 0.1742, "num_input_tokens_seen": 20814976, "step": 20685 }, { "epoch": 9.754832626119756, "grad_norm": 1.1862047910690308, "learning_rate": 3.039241074019421e-05, "loss": 0.1559, "num_input_tokens_seen": 20819104, "step": 20690 }, { "epoch": 9.757190004714758, "grad_norm": 0.8820156455039978, "learning_rate": 3.0382366407214646e-05, "loss": 0.1193, "num_input_tokens_seen": 20823904, "step": 20695 }, { "epoch": 9.75954738330976, "grad_norm": 0.11475783586502075, "learning_rate": 3.0372321163091838e-05, "loss": 0.1631, "num_input_tokens_seen": 20830048, "step": 20700 }, { "epoch": 9.761904761904763, "grad_norm": 0.578546941280365, "learning_rate": 3.0362275009526275e-05, "loss": 0.2431, "num_input_tokens_seen": 20834688, "step": 20705 }, { "epoch": 9.764262140499763, "grad_norm": 1.1044921875, "learning_rate": 3.0352227948218603e-05, "loss": 0.0483, "num_input_tokens_seen": 20839040, "step": 20710 }, { "epoch": 9.766619519094768, "grad_norm": 0.08480820804834366, "learning_rate": 3.0342179980869616e-05, "loss": 0.1085, "num_input_tokens_seen": 20843840, "step": 20715 }, { "epoch": 9.768976897689768, "grad_norm": 0.8121218085289001, "learning_rate": 3.0332131109180255e-05, "loss": 0.1137, "num_input_tokens_seen": 20848512, "step": 20720 }, { "epoch": 9.77133427628477, "grad_norm": 0.6889327168464661, "learning_rate": 3.0322081334851637e-05, "loss": 0.1653, "num_input_tokens_seen": 20853120, "step": 20725 }, { "epoch": 9.773691654879773, "grad_norm": 1.9950675964355469, "learning_rate": 3.0312030659585013e-05, "loss": 0.1254, "num_input_tokens_seen": 20857536, "step": 20730 }, { "epoch": 9.776049033474775, "grad_norm": 0.29036441445350647, "learning_rate": 3.0301979085081788e-05, "loss": 0.1776, "num_input_tokens_seen": 20862688, "step": 20735 }, { "epoch": 9.778406412069778, "grad_norm": 0.5911231637001038, "learning_rate": 3.0291926613043526e-05, "loss": 0.2376, "num_input_tokens_seen": 20867712, "step": 20740 }, { "epoch": 9.78076379066478, "grad_norm": 0.07166549563407898, "learning_rate": 3.0281873245171943e-05, "loss": 0.0436, "num_input_tokens_seen": 20872544, "step": 20745 }, { "epoch": 9.783121169259783, "grad_norm": 0.10763120651245117, "learning_rate": 3.02718189831689e-05, "loss": 0.2089, "num_input_tokens_seen": 20877504, "step": 20750 }, { "epoch": 9.785478547854785, "grad_norm": 0.15614625811576843, "learning_rate": 3.0261763828736417e-05, "loss": 0.0488, "num_input_tokens_seen": 20882656, "step": 20755 }, { "epoch": 9.787835926449787, "grad_norm": 0.5041539072990417, "learning_rate": 3.025170778357665e-05, "loss": 0.1431, "num_input_tokens_seen": 20887136, "step": 20760 }, { "epoch": 9.79019330504479, "grad_norm": 0.09348580241203308, "learning_rate": 3.024165084939193e-05, "loss": 0.0704, "num_input_tokens_seen": 20891488, "step": 20765 }, { "epoch": 9.792550683639792, "grad_norm": 1.2902520895004272, "learning_rate": 3.0231593027884712e-05, "loss": 0.0512, "num_input_tokens_seen": 20896992, "step": 20770 }, { "epoch": 9.794908062234795, "grad_norm": 1.9844149351119995, "learning_rate": 3.0221534320757633e-05, "loss": 0.3546, "num_input_tokens_seen": 20901504, "step": 20775 }, { "epoch": 9.797265440829797, "grad_norm": 1.3936809301376343, "learning_rate": 3.0211474729713442e-05, "loss": 0.1012, "num_input_tokens_seen": 20906880, "step": 20780 }, { "epoch": 9.7996228194248, "grad_norm": 0.5314071774482727, "learning_rate": 3.0201414256455068e-05, "loss": 0.1235, "num_input_tokens_seen": 20911776, "step": 20785 }, { "epoch": 9.801980198019802, "grad_norm": 0.46903660893440247, "learning_rate": 3.0191352902685572e-05, "loss": 0.0774, "num_input_tokens_seen": 20917376, "step": 20790 }, { "epoch": 9.804337576614804, "grad_norm": 0.2572651505470276, "learning_rate": 3.0181290670108183e-05, "loss": 0.1283, "num_input_tokens_seen": 20922048, "step": 20795 }, { "epoch": 9.806694955209807, "grad_norm": 1.3270725011825562, "learning_rate": 3.0171227560426245e-05, "loss": 0.1757, "num_input_tokens_seen": 20927008, "step": 20800 }, { "epoch": 9.809052333804809, "grad_norm": 0.25019049644470215, "learning_rate": 3.0161163575343288e-05, "loss": 0.0764, "num_input_tokens_seen": 20930944, "step": 20805 }, { "epoch": 9.811409712399811, "grad_norm": 0.0566551573574543, "learning_rate": 3.015109871656297e-05, "loss": 0.0697, "num_input_tokens_seen": 20935296, "step": 20810 }, { "epoch": 9.813767090994814, "grad_norm": 1.9109188318252563, "learning_rate": 3.0141032985789103e-05, "loss": 0.2282, "num_input_tokens_seen": 20940064, "step": 20815 }, { "epoch": 9.816124469589816, "grad_norm": 0.17730669677257538, "learning_rate": 3.0130966384725638e-05, "loss": 0.0474, "num_input_tokens_seen": 20944064, "step": 20820 }, { "epoch": 9.818481848184819, "grad_norm": 0.10714097321033478, "learning_rate": 3.012089891507669e-05, "loss": 0.0622, "num_input_tokens_seen": 20948672, "step": 20825 }, { "epoch": 9.820839226779821, "grad_norm": 1.4897154569625854, "learning_rate": 3.0110830578546497e-05, "loss": 0.1287, "num_input_tokens_seen": 20953696, "step": 20830 }, { "epoch": 9.823196605374823, "grad_norm": 1.2054204940795898, "learning_rate": 3.010076137683946e-05, "loss": 0.1902, "num_input_tokens_seen": 20959296, "step": 20835 }, { "epoch": 9.825553983969826, "grad_norm": 0.33977261185646057, "learning_rate": 3.0090691311660137e-05, "loss": 0.0555, "num_input_tokens_seen": 20964480, "step": 20840 }, { "epoch": 9.827911362564828, "grad_norm": 0.4704458713531494, "learning_rate": 3.00806203847132e-05, "loss": 0.0835, "num_input_tokens_seen": 20969376, "step": 20845 }, { "epoch": 9.83026874115983, "grad_norm": 0.2912880480289459, "learning_rate": 3.00705485977035e-05, "loss": 0.1277, "num_input_tokens_seen": 20975648, "step": 20850 }, { "epoch": 9.832626119754833, "grad_norm": 0.2401178628206253, "learning_rate": 3.0060475952336015e-05, "loss": 0.0547, "num_input_tokens_seen": 20981632, "step": 20855 }, { "epoch": 9.834983498349835, "grad_norm": 0.07122251391410828, "learning_rate": 3.005040245031588e-05, "loss": 0.078, "num_input_tokens_seen": 20987584, "step": 20860 }, { "epoch": 9.837340876944838, "grad_norm": 0.2559572756290436, "learning_rate": 3.004032809334835e-05, "loss": 0.0386, "num_input_tokens_seen": 20992256, "step": 20865 }, { "epoch": 9.83969825553984, "grad_norm": 1.1157997846603394, "learning_rate": 3.003025288313886e-05, "loss": 0.2037, "num_input_tokens_seen": 20997504, "step": 20870 }, { "epoch": 9.842055634134843, "grad_norm": 1.4139155149459839, "learning_rate": 3.0020176821392964e-05, "loss": 0.1763, "num_input_tokens_seen": 21002752, "step": 20875 }, { "epoch": 9.844413012729845, "grad_norm": 0.19358307123184204, "learning_rate": 3.0010099909816374e-05, "loss": 0.0867, "num_input_tokens_seen": 21008608, "step": 20880 }, { "epoch": 9.846770391324847, "grad_norm": 0.33037063479423523, "learning_rate": 3.000002215011493e-05, "loss": 0.1509, "num_input_tokens_seen": 21012928, "step": 20885 }, { "epoch": 9.84912776991985, "grad_norm": 1.4802558422088623, "learning_rate": 2.9989943543994635e-05, "loss": 0.1756, "num_input_tokens_seen": 21018080, "step": 20890 }, { "epoch": 9.851485148514852, "grad_norm": 0.7896214127540588, "learning_rate": 2.997986409316162e-05, "loss": 0.0921, "num_input_tokens_seen": 21022112, "step": 20895 }, { "epoch": 9.853842527109855, "grad_norm": 0.23329824209213257, "learning_rate": 2.9969783799322172e-05, "loss": 0.0953, "num_input_tokens_seen": 21027712, "step": 20900 }, { "epoch": 9.856199905704855, "grad_norm": 0.17134730517864227, "learning_rate": 2.9959702664182704e-05, "loss": 0.0569, "num_input_tokens_seen": 21032384, "step": 20905 }, { "epoch": 9.85855728429986, "grad_norm": 1.3082258701324463, "learning_rate": 2.9949620689449786e-05, "loss": 0.0835, "num_input_tokens_seen": 21037824, "step": 20910 }, { "epoch": 9.86091466289486, "grad_norm": 1.0425570011138916, "learning_rate": 2.9939537876830124e-05, "loss": 0.144, "num_input_tokens_seen": 21042816, "step": 20915 }, { "epoch": 9.863272041489862, "grad_norm": 0.0734013244509697, "learning_rate": 2.9929454228030573e-05, "loss": 0.1557, "num_input_tokens_seen": 21047520, "step": 20920 }, { "epoch": 9.865629420084865, "grad_norm": 0.352701336145401, "learning_rate": 2.991936974475812e-05, "loss": 0.0372, "num_input_tokens_seen": 21052512, "step": 20925 }, { "epoch": 9.867986798679867, "grad_norm": 2.036062240600586, "learning_rate": 2.9909284428719886e-05, "loss": 0.0721, "num_input_tokens_seen": 21057952, "step": 20930 }, { "epoch": 9.87034417727487, "grad_norm": 1.895516276359558, "learning_rate": 2.9899198281623166e-05, "loss": 0.0422, "num_input_tokens_seen": 21063200, "step": 20935 }, { "epoch": 9.872701555869872, "grad_norm": 2.138477087020874, "learning_rate": 2.988911130517535e-05, "loss": 0.1933, "num_input_tokens_seen": 21069024, "step": 20940 }, { "epoch": 9.875058934464874, "grad_norm": 0.04420320317149162, "learning_rate": 2.9879023501084013e-05, "loss": 0.064, "num_input_tokens_seen": 21073408, "step": 20945 }, { "epoch": 9.877416313059877, "grad_norm": 0.2243540734052658, "learning_rate": 2.9868934871056832e-05, "loss": 0.0818, "num_input_tokens_seen": 21078688, "step": 20950 }, { "epoch": 9.87977369165488, "grad_norm": 0.4637893736362457, "learning_rate": 2.985884541680164e-05, "loss": 0.1675, "num_input_tokens_seen": 21082912, "step": 20955 }, { "epoch": 9.882131070249882, "grad_norm": 0.049171414226293564, "learning_rate": 2.9848755140026423e-05, "loss": 0.1823, "num_input_tokens_seen": 21088192, "step": 20960 }, { "epoch": 9.884488448844884, "grad_norm": 1.5330036878585815, "learning_rate": 2.983866404243929e-05, "loss": 0.2171, "num_input_tokens_seen": 21092768, "step": 20965 }, { "epoch": 9.886845827439886, "grad_norm": 1.1942741870880127, "learning_rate": 2.982857212574849e-05, "loss": 0.1868, "num_input_tokens_seen": 21097888, "step": 20970 }, { "epoch": 9.889203206034889, "grad_norm": 0.6606881022453308, "learning_rate": 2.9818479391662412e-05, "loss": 0.0864, "num_input_tokens_seen": 21102400, "step": 20975 }, { "epoch": 9.891560584629891, "grad_norm": 0.6692224740982056, "learning_rate": 2.9808385841889587e-05, "loss": 0.0867, "num_input_tokens_seen": 21108160, "step": 20980 }, { "epoch": 9.893917963224894, "grad_norm": 0.07997194677591324, "learning_rate": 2.9798291478138685e-05, "loss": 0.0405, "num_input_tokens_seen": 21113184, "step": 20985 }, { "epoch": 9.896275341819896, "grad_norm": 0.14946028590202332, "learning_rate": 2.9788196302118498e-05, "loss": 0.1871, "num_input_tokens_seen": 21118208, "step": 20990 }, { "epoch": 9.898632720414899, "grad_norm": 0.09053807705640793, "learning_rate": 2.9778100315537976e-05, "loss": 0.1184, "num_input_tokens_seen": 21123488, "step": 20995 }, { "epoch": 9.900990099009901, "grad_norm": 1.4769574403762817, "learning_rate": 2.9768003520106204e-05, "loss": 0.093, "num_input_tokens_seen": 21128800, "step": 21000 }, { "epoch": 9.903347477604903, "grad_norm": 1.4678295850753784, "learning_rate": 2.9757905917532393e-05, "loss": 0.1258, "num_input_tokens_seen": 21132352, "step": 21005 }, { "epoch": 9.905704856199906, "grad_norm": 0.33490997552871704, "learning_rate": 2.9747807509525892e-05, "loss": 0.1062, "num_input_tokens_seen": 21139040, "step": 21010 }, { "epoch": 9.908062234794908, "grad_norm": 1.394860863685608, "learning_rate": 2.973770829779619e-05, "loss": 0.1406, "num_input_tokens_seen": 21143840, "step": 21015 }, { "epoch": 9.91041961338991, "grad_norm": 0.07958400249481201, "learning_rate": 2.9727608284052917e-05, "loss": 0.1104, "num_input_tokens_seen": 21149664, "step": 21020 }, { "epoch": 9.912776991984913, "grad_norm": 1.3424160480499268, "learning_rate": 2.9717507470005833e-05, "loss": 0.0649, "num_input_tokens_seen": 21156064, "step": 21025 }, { "epoch": 9.915134370579915, "grad_norm": 1.1832749843597412, "learning_rate": 2.9707405857364835e-05, "loss": 0.2231, "num_input_tokens_seen": 21160928, "step": 21030 }, { "epoch": 9.917491749174918, "grad_norm": 1.8133071660995483, "learning_rate": 2.9697303447839946e-05, "loss": 0.1484, "num_input_tokens_seen": 21165248, "step": 21035 }, { "epoch": 9.91984912776992, "grad_norm": 1.280775785446167, "learning_rate": 2.9687200243141346e-05, "loss": 0.1992, "num_input_tokens_seen": 21170112, "step": 21040 }, { "epoch": 9.922206506364923, "grad_norm": 0.7496206164360046, "learning_rate": 2.9677096244979323e-05, "loss": 0.1284, "num_input_tokens_seen": 21175584, "step": 21045 }, { "epoch": 9.924563884959925, "grad_norm": 1.131786584854126, "learning_rate": 2.966699145506433e-05, "loss": 0.0747, "num_input_tokens_seen": 21179744, "step": 21050 }, { "epoch": 9.926921263554927, "grad_norm": 1.4439730644226074, "learning_rate": 2.965688587510691e-05, "loss": 0.2991, "num_input_tokens_seen": 21184704, "step": 21055 }, { "epoch": 9.92927864214993, "grad_norm": 1.2883943319320679, "learning_rate": 2.9646779506817785e-05, "loss": 0.0972, "num_input_tokens_seen": 21189664, "step": 21060 }, { "epoch": 9.931636020744932, "grad_norm": 0.880553126335144, "learning_rate": 2.963667235190779e-05, "loss": 0.1119, "num_input_tokens_seen": 21194240, "step": 21065 }, { "epoch": 9.933993399339935, "grad_norm": 0.8680047392845154, "learning_rate": 2.9626564412087892e-05, "loss": 0.078, "num_input_tokens_seen": 21199552, "step": 21070 }, { "epoch": 9.936350777934937, "grad_norm": 0.352031409740448, "learning_rate": 2.9616455689069194e-05, "loss": 0.1068, "num_input_tokens_seen": 21203232, "step": 21075 }, { "epoch": 9.93870815652994, "grad_norm": 0.06056329980492592, "learning_rate": 2.9606346184562927e-05, "loss": 0.0492, "num_input_tokens_seen": 21209696, "step": 21080 }, { "epoch": 9.941065535124942, "grad_norm": 0.37791168689727783, "learning_rate": 2.9596235900280463e-05, "loss": 0.0232, "num_input_tokens_seen": 21214688, "step": 21085 }, { "epoch": 9.943422913719944, "grad_norm": 0.8724961280822754, "learning_rate": 2.9586124837933303e-05, "loss": 0.1123, "num_input_tokens_seen": 21221184, "step": 21090 }, { "epoch": 9.945780292314947, "grad_norm": 1.2929972410202026, "learning_rate": 2.9576012999233078e-05, "loss": 0.1217, "num_input_tokens_seen": 21225408, "step": 21095 }, { "epoch": 9.948137670909949, "grad_norm": 2.023543119430542, "learning_rate": 2.9565900385891542e-05, "loss": 0.1691, "num_input_tokens_seen": 21230400, "step": 21100 }, { "epoch": 9.950495049504951, "grad_norm": 0.6891734004020691, "learning_rate": 2.95557869996206e-05, "loss": 0.1524, "num_input_tokens_seen": 21234752, "step": 21105 }, { "epoch": 9.952852428099952, "grad_norm": 1.0363835096359253, "learning_rate": 2.9545672842132273e-05, "loss": 0.1367, "num_input_tokens_seen": 21240192, "step": 21110 }, { "epoch": 9.955209806694956, "grad_norm": 0.5632779598236084, "learning_rate": 2.953555791513871e-05, "loss": 0.1459, "num_input_tokens_seen": 21244608, "step": 21115 }, { "epoch": 9.957567185289957, "grad_norm": 0.2580971121788025, "learning_rate": 2.9525442220352208e-05, "loss": 0.0826, "num_input_tokens_seen": 21251680, "step": 21120 }, { "epoch": 9.95992456388496, "grad_norm": 0.493814080953598, "learning_rate": 2.9515325759485167e-05, "loss": 0.024, "num_input_tokens_seen": 21257600, "step": 21125 }, { "epoch": 9.962281942479962, "grad_norm": 1.5608686208724976, "learning_rate": 2.950520853425015e-05, "loss": 0.0902, "num_input_tokens_seen": 21262336, "step": 21130 }, { "epoch": 9.964639321074964, "grad_norm": 0.159124493598938, "learning_rate": 2.9495090546359816e-05, "loss": 0.0567, "num_input_tokens_seen": 21267136, "step": 21135 }, { "epoch": 9.966996699669966, "grad_norm": 0.6941871047019958, "learning_rate": 2.9484971797526972e-05, "loss": 0.0401, "num_input_tokens_seen": 21271872, "step": 21140 }, { "epoch": 9.969354078264969, "grad_norm": 0.38066932559013367, "learning_rate": 2.947485228946456e-05, "loss": 0.0665, "num_input_tokens_seen": 21277056, "step": 21145 }, { "epoch": 9.971711456859971, "grad_norm": 0.273594468832016, "learning_rate": 2.946473202388563e-05, "loss": 0.106, "num_input_tokens_seen": 21281888, "step": 21150 }, { "epoch": 9.974068835454974, "grad_norm": 0.44032177329063416, "learning_rate": 2.945461100250338e-05, "loss": 0.0906, "num_input_tokens_seen": 21288128, "step": 21155 }, { "epoch": 9.976426214049976, "grad_norm": 0.7106378078460693, "learning_rate": 2.9444489227031124e-05, "loss": 0.1743, "num_input_tokens_seen": 21293280, "step": 21160 }, { "epoch": 9.978783592644978, "grad_norm": 0.6812297105789185, "learning_rate": 2.9434366699182297e-05, "loss": 0.0838, "num_input_tokens_seen": 21298880, "step": 21165 }, { "epoch": 9.98114097123998, "grad_norm": 1.1284959316253662, "learning_rate": 2.9424243420670483e-05, "loss": 0.0944, "num_input_tokens_seen": 21304480, "step": 21170 }, { "epoch": 9.983498349834983, "grad_norm": 1.5696073770523071, "learning_rate": 2.9414119393209376e-05, "loss": 0.0871, "num_input_tokens_seen": 21309120, "step": 21175 }, { "epoch": 9.985855728429986, "grad_norm": 0.8999943137168884, "learning_rate": 2.9403994618512805e-05, "loss": 0.0589, "num_input_tokens_seen": 21314048, "step": 21180 }, { "epoch": 9.988213107024988, "grad_norm": 0.05680607259273529, "learning_rate": 2.939386909829472e-05, "loss": 0.1725, "num_input_tokens_seen": 21319552, "step": 21185 }, { "epoch": 9.99057048561999, "grad_norm": 0.25359591841697693, "learning_rate": 2.9383742834269197e-05, "loss": 0.0354, "num_input_tokens_seen": 21323808, "step": 21190 }, { "epoch": 9.992927864214993, "grad_norm": 0.12531320750713348, "learning_rate": 2.9373615828150452e-05, "loss": 0.0907, "num_input_tokens_seen": 21329056, "step": 21195 }, { "epoch": 9.995285242809995, "grad_norm": 0.4320329427719116, "learning_rate": 2.9363488081652802e-05, "loss": 0.0586, "num_input_tokens_seen": 21333280, "step": 21200 }, { "epoch": 9.997642621404998, "grad_norm": 1.4456697702407837, "learning_rate": 2.9353359596490705e-05, "loss": 0.2289, "num_input_tokens_seen": 21337632, "step": 21205 }, { "epoch": 10.0, "grad_norm": 1.2886745929718018, "learning_rate": 2.9343230374378743e-05, "loss": 0.0432, "num_input_tokens_seen": 21342336, "step": 21210 }, { "epoch": 10.0, "eval_loss": 0.15169008076190948, "eval_runtime": 15.1361, "eval_samples_per_second": 62.302, "eval_steps_per_second": 15.592, "num_input_tokens_seen": 21342336, "step": 21210 }, { "epoch": 10.002357378595002, "grad_norm": 0.03040679357945919, "learning_rate": 2.933310041703163e-05, "loss": 0.2246, "num_input_tokens_seen": 21348032, "step": 21215 }, { "epoch": 10.004714757190005, "grad_norm": 0.045899417251348495, "learning_rate": 2.9322969726164184e-05, "loss": 0.1716, "num_input_tokens_seen": 21353856, "step": 21220 }, { "epoch": 10.007072135785007, "grad_norm": 0.7273041605949402, "learning_rate": 2.931283830349136e-05, "loss": 0.1227, "num_input_tokens_seen": 21360096, "step": 21225 }, { "epoch": 10.00942951438001, "grad_norm": 1.3537921905517578, "learning_rate": 2.9302706150728242e-05, "loss": 0.2034, "num_input_tokens_seen": 21365952, "step": 21230 }, { "epoch": 10.011786892975012, "grad_norm": 0.29294857382774353, "learning_rate": 2.929257326959003e-05, "loss": 0.1473, "num_input_tokens_seen": 21370528, "step": 21235 }, { "epoch": 10.014144271570014, "grad_norm": 0.6745465993881226, "learning_rate": 2.928243966179205e-05, "loss": 0.0987, "num_input_tokens_seen": 21376384, "step": 21240 }, { "epoch": 10.016501650165017, "grad_norm": 1.2620060443878174, "learning_rate": 2.9272305329049736e-05, "loss": 0.1006, "num_input_tokens_seen": 21381056, "step": 21245 }, { "epoch": 10.01885902876002, "grad_norm": 0.189298614859581, "learning_rate": 2.9262170273078676e-05, "loss": 0.0829, "num_input_tokens_seen": 21385344, "step": 21250 }, { "epoch": 10.021216407355022, "grad_norm": 0.012219741940498352, "learning_rate": 2.925203449559456e-05, "loss": 0.031, "num_input_tokens_seen": 21390784, "step": 21255 }, { "epoch": 10.023573785950024, "grad_norm": 2.806126117706299, "learning_rate": 2.9241897998313195e-05, "loss": 0.1632, "num_input_tokens_seen": 21395200, "step": 21260 }, { "epoch": 10.025931164545026, "grad_norm": 0.18767639994621277, "learning_rate": 2.9231760782950525e-05, "loss": 0.0696, "num_input_tokens_seen": 21400512, "step": 21265 }, { "epoch": 10.028288543140029, "grad_norm": 1.1218218803405762, "learning_rate": 2.92216228512226e-05, "loss": 0.2323, "num_input_tokens_seen": 21405792, "step": 21270 }, { "epoch": 10.030645921735031, "grad_norm": 1.0355644226074219, "learning_rate": 2.9211484204845617e-05, "loss": 0.1555, "num_input_tokens_seen": 21413696, "step": 21275 }, { "epoch": 10.033003300330034, "grad_norm": 0.301609069108963, "learning_rate": 2.9201344845535862e-05, "loss": 0.2099, "num_input_tokens_seen": 21418400, "step": 21280 }, { "epoch": 10.035360678925036, "grad_norm": 0.7297180891036987, "learning_rate": 2.919120477500975e-05, "loss": 0.1004, "num_input_tokens_seen": 21423520, "step": 21285 }, { "epoch": 10.037718057520038, "grad_norm": 0.9124301075935364, "learning_rate": 2.9181063994983836e-05, "loss": 0.1882, "num_input_tokens_seen": 21428512, "step": 21290 }, { "epoch": 10.04007543611504, "grad_norm": 0.23569144308567047, "learning_rate": 2.917092250717478e-05, "loss": 0.0786, "num_input_tokens_seen": 21433664, "step": 21295 }, { "epoch": 10.042432814710043, "grad_norm": 1.0095884799957275, "learning_rate": 2.9160780313299363e-05, "loss": 0.0376, "num_input_tokens_seen": 21438624, "step": 21300 }, { "epoch": 10.044790193305046, "grad_norm": 0.6041002869606018, "learning_rate": 2.915063741507448e-05, "loss": 0.0651, "num_input_tokens_seen": 21443872, "step": 21305 }, { "epoch": 10.047147571900048, "grad_norm": 0.5047493577003479, "learning_rate": 2.9140493814217157e-05, "loss": 0.0269, "num_input_tokens_seen": 21448704, "step": 21310 }, { "epoch": 10.049504950495049, "grad_norm": 0.7647037506103516, "learning_rate": 2.9130349512444544e-05, "loss": 0.1096, "num_input_tokens_seen": 21453664, "step": 21315 }, { "epoch": 10.051862329090051, "grad_norm": 0.14741624891757965, "learning_rate": 2.9120204511473876e-05, "loss": 0.1962, "num_input_tokens_seen": 21458400, "step": 21320 }, { "epoch": 10.054219707685053, "grad_norm": 1.5521478652954102, "learning_rate": 2.911005881302254e-05, "loss": 0.1231, "num_input_tokens_seen": 21464032, "step": 21325 }, { "epoch": 10.056577086280056, "grad_norm": 0.17218200862407684, "learning_rate": 2.9099912418808033e-05, "loss": 0.0313, "num_input_tokens_seen": 21468992, "step": 21330 }, { "epoch": 10.058934464875058, "grad_norm": 2.2769386768341064, "learning_rate": 2.9089765330547975e-05, "loss": 0.1632, "num_input_tokens_seen": 21473856, "step": 21335 }, { "epoch": 10.06129184347006, "grad_norm": 0.1839546412229538, "learning_rate": 2.907961754996008e-05, "loss": 0.1128, "num_input_tokens_seen": 21479808, "step": 21340 }, { "epoch": 10.063649222065063, "grad_norm": 0.8314381241798401, "learning_rate": 2.90694690787622e-05, "loss": 0.0482, "num_input_tokens_seen": 21485440, "step": 21345 }, { "epoch": 10.066006600660065, "grad_norm": 0.8473315238952637, "learning_rate": 2.90593199186723e-05, "loss": 0.0351, "num_input_tokens_seen": 21490624, "step": 21350 }, { "epoch": 10.068363979255068, "grad_norm": 0.2949684262275696, "learning_rate": 2.9049170071408466e-05, "loss": 0.0949, "num_input_tokens_seen": 21495424, "step": 21355 }, { "epoch": 10.07072135785007, "grad_norm": 2.6726491451263428, "learning_rate": 2.9039019538688895e-05, "loss": 0.0871, "num_input_tokens_seen": 21500160, "step": 21360 }, { "epoch": 10.073078736445073, "grad_norm": 0.812396764755249, "learning_rate": 2.9028868322231884e-05, "loss": 0.0567, "num_input_tokens_seen": 21505696, "step": 21365 }, { "epoch": 10.075436115040075, "grad_norm": 0.7550054788589478, "learning_rate": 2.9018716423755877e-05, "loss": 0.2149, "num_input_tokens_seen": 21511232, "step": 21370 }, { "epoch": 10.077793493635077, "grad_norm": 0.7013345956802368, "learning_rate": 2.9008563844979408e-05, "loss": 0.1439, "num_input_tokens_seen": 21516256, "step": 21375 }, { "epoch": 10.08015087223008, "grad_norm": 1.5454598665237427, "learning_rate": 2.899841058762115e-05, "loss": 0.2781, "num_input_tokens_seen": 21521568, "step": 21380 }, { "epoch": 10.082508250825082, "grad_norm": 1.0403220653533936, "learning_rate": 2.898825665339987e-05, "loss": 0.2595, "num_input_tokens_seen": 21528320, "step": 21385 }, { "epoch": 10.084865629420085, "grad_norm": 1.1200464963912964, "learning_rate": 2.897810204403445e-05, "loss": 0.1523, "num_input_tokens_seen": 21533216, "step": 21390 }, { "epoch": 10.087223008015087, "grad_norm": 0.3924318850040436, "learning_rate": 2.89679467612439e-05, "loss": 0.1324, "num_input_tokens_seen": 21539552, "step": 21395 }, { "epoch": 10.08958038661009, "grad_norm": 0.2527238130569458, "learning_rate": 2.8957790806747337e-05, "loss": 0.1056, "num_input_tokens_seen": 21544128, "step": 21400 }, { "epoch": 10.091937765205092, "grad_norm": 0.3746257424354553, "learning_rate": 2.8947634182263984e-05, "loss": 0.1545, "num_input_tokens_seen": 21551232, "step": 21405 }, { "epoch": 10.094295143800094, "grad_norm": 1.9967725276947021, "learning_rate": 2.8937476889513194e-05, "loss": 0.293, "num_input_tokens_seen": 21556256, "step": 21410 }, { "epoch": 10.096652522395097, "grad_norm": 0.68394935131073, "learning_rate": 2.8927318930214414e-05, "loss": 0.0999, "num_input_tokens_seen": 21561184, "step": 21415 }, { "epoch": 10.099009900990099, "grad_norm": 0.5627833604812622, "learning_rate": 2.8917160306087233e-05, "loss": 0.1105, "num_input_tokens_seen": 21565888, "step": 21420 }, { "epoch": 10.101367279585101, "grad_norm": 2.0335261821746826, "learning_rate": 2.8907001018851317e-05, "loss": 0.1682, "num_input_tokens_seen": 21571712, "step": 21425 }, { "epoch": 10.103724658180104, "grad_norm": 1.694705605506897, "learning_rate": 2.8896841070226465e-05, "loss": 0.1516, "num_input_tokens_seen": 21577120, "step": 21430 }, { "epoch": 10.106082036775106, "grad_norm": 1.3798537254333496, "learning_rate": 2.888668046193258e-05, "loss": 0.3188, "num_input_tokens_seen": 21582496, "step": 21435 }, { "epoch": 10.108439415370109, "grad_norm": 2.984701156616211, "learning_rate": 2.8876519195689696e-05, "loss": 0.1468, "num_input_tokens_seen": 21587200, "step": 21440 }, { "epoch": 10.110796793965111, "grad_norm": 0.5619177222251892, "learning_rate": 2.8866357273217925e-05, "loss": 0.0438, "num_input_tokens_seen": 21592160, "step": 21445 }, { "epoch": 10.113154172560114, "grad_norm": 0.13696125149726868, "learning_rate": 2.8856194696237515e-05, "loss": 0.0476, "num_input_tokens_seen": 21597344, "step": 21450 }, { "epoch": 10.115511551155116, "grad_norm": 1.30062997341156, "learning_rate": 2.8846031466468816e-05, "loss": 0.091, "num_input_tokens_seen": 21602176, "step": 21455 }, { "epoch": 10.117868929750118, "grad_norm": 0.7673304677009583, "learning_rate": 2.8835867585632294e-05, "loss": 0.096, "num_input_tokens_seen": 21605984, "step": 21460 }, { "epoch": 10.12022630834512, "grad_norm": 0.31912896037101746, "learning_rate": 2.8825703055448517e-05, "loss": 0.1575, "num_input_tokens_seen": 21612096, "step": 21465 }, { "epoch": 10.122583686940123, "grad_norm": 0.9308834671974182, "learning_rate": 2.8815537877638172e-05, "loss": 0.1231, "num_input_tokens_seen": 21617600, "step": 21470 }, { "epoch": 10.124941065535126, "grad_norm": 0.15350711345672607, "learning_rate": 2.8805372053922043e-05, "loss": 0.109, "num_input_tokens_seen": 21624576, "step": 21475 }, { "epoch": 10.127298444130128, "grad_norm": 0.4128985106945038, "learning_rate": 2.8795205586021044e-05, "loss": 0.0656, "num_input_tokens_seen": 21630048, "step": 21480 }, { "epoch": 10.12965582272513, "grad_norm": 0.08821775764226913, "learning_rate": 2.8785038475656173e-05, "loss": 0.0469, "num_input_tokens_seen": 21634752, "step": 21485 }, { "epoch": 10.132013201320133, "grad_norm": 1.9219011068344116, "learning_rate": 2.8774870724548557e-05, "loss": 0.1039, "num_input_tokens_seen": 21639840, "step": 21490 }, { "epoch": 10.134370579915135, "grad_norm": 0.0851312205195427, "learning_rate": 2.876470233441942e-05, "loss": 0.0216, "num_input_tokens_seen": 21644352, "step": 21495 }, { "epoch": 10.136727958510138, "grad_norm": 1.1185399293899536, "learning_rate": 2.87545333069901e-05, "loss": 0.16, "num_input_tokens_seen": 21649184, "step": 21500 }, { "epoch": 10.13908533710514, "grad_norm": 0.9250165820121765, "learning_rate": 2.874436364398204e-05, "loss": 0.1049, "num_input_tokens_seen": 21653632, "step": 21505 }, { "epoch": 10.14144271570014, "grad_norm": 0.8175772428512573, "learning_rate": 2.8734193347116793e-05, "loss": 0.0801, "num_input_tokens_seen": 21659264, "step": 21510 }, { "epoch": 10.143800094295143, "grad_norm": 1.3184890747070312, "learning_rate": 2.872402241811601e-05, "loss": 0.1449, "num_input_tokens_seen": 21664224, "step": 21515 }, { "epoch": 10.146157472890145, "grad_norm": 0.08629105240106583, "learning_rate": 2.8713850858701463e-05, "loss": 0.1851, "num_input_tokens_seen": 21669952, "step": 21520 }, { "epoch": 10.148514851485148, "grad_norm": 0.3194160461425781, "learning_rate": 2.8703678670595023e-05, "loss": 0.1505, "num_input_tokens_seen": 21674432, "step": 21525 }, { "epoch": 10.15087223008015, "grad_norm": 0.35314270853996277, "learning_rate": 2.8693505855518672e-05, "loss": 0.2027, "num_input_tokens_seen": 21680352, "step": 21530 }, { "epoch": 10.153229608675153, "grad_norm": 0.201555073261261, "learning_rate": 2.8683332415194485e-05, "loss": 0.0889, "num_input_tokens_seen": 21685600, "step": 21535 }, { "epoch": 10.155586987270155, "grad_norm": 0.2349451780319214, "learning_rate": 2.8673158351344666e-05, "loss": 0.1461, "num_input_tokens_seen": 21690912, "step": 21540 }, { "epoch": 10.157944365865157, "grad_norm": 0.07591943442821503, "learning_rate": 2.8662983665691508e-05, "loss": 0.1025, "num_input_tokens_seen": 21695488, "step": 21545 }, { "epoch": 10.16030174446016, "grad_norm": 0.22941255569458008, "learning_rate": 2.86528083599574e-05, "loss": 0.0394, "num_input_tokens_seen": 21700352, "step": 21550 }, { "epoch": 10.162659123055162, "grad_norm": 0.2367020547389984, "learning_rate": 2.8642632435864862e-05, "loss": 0.2241, "num_input_tokens_seen": 21708160, "step": 21555 }, { "epoch": 10.165016501650165, "grad_norm": 0.2825816869735718, "learning_rate": 2.86324558951365e-05, "loss": 0.0339, "num_input_tokens_seen": 21712448, "step": 21560 }, { "epoch": 10.167373880245167, "grad_norm": 0.1471102088689804, "learning_rate": 2.8622278739495035e-05, "loss": 0.0705, "num_input_tokens_seen": 21717184, "step": 21565 }, { "epoch": 10.16973125884017, "grad_norm": 0.03634290024638176, "learning_rate": 2.861210097066328e-05, "loss": 0.073, "num_input_tokens_seen": 21723456, "step": 21570 }, { "epoch": 10.172088637435172, "grad_norm": 0.7211495041847229, "learning_rate": 2.8601922590364154e-05, "loss": 0.1005, "num_input_tokens_seen": 21730048, "step": 21575 }, { "epoch": 10.174446016030174, "grad_norm": 0.08041857928037643, "learning_rate": 2.8591743600320697e-05, "loss": 0.1536, "num_input_tokens_seen": 21734848, "step": 21580 }, { "epoch": 10.176803394625177, "grad_norm": 0.3460625410079956, "learning_rate": 2.8581564002256034e-05, "loss": 0.0877, "num_input_tokens_seen": 21739584, "step": 21585 }, { "epoch": 10.179160773220179, "grad_norm": 0.23792259395122528, "learning_rate": 2.8571383797893397e-05, "loss": 0.0617, "num_input_tokens_seen": 21744384, "step": 21590 }, { "epoch": 10.181518151815181, "grad_norm": 0.32641157507896423, "learning_rate": 2.8561202988956115e-05, "loss": 0.111, "num_input_tokens_seen": 21749920, "step": 21595 }, { "epoch": 10.183875530410184, "grad_norm": 0.42873936891555786, "learning_rate": 2.8551021577167636e-05, "loss": 0.0765, "num_input_tokens_seen": 21754176, "step": 21600 }, { "epoch": 10.186232909005186, "grad_norm": 0.7288914918899536, "learning_rate": 2.8540839564251502e-05, "loss": 0.1733, "num_input_tokens_seen": 21759712, "step": 21605 }, { "epoch": 10.188590287600189, "grad_norm": 1.8334622383117676, "learning_rate": 2.853065695193135e-05, "loss": 0.1566, "num_input_tokens_seen": 21764256, "step": 21610 }, { "epoch": 10.190947666195191, "grad_norm": 0.5621520280838013, "learning_rate": 2.852047374193092e-05, "loss": 0.0759, "num_input_tokens_seen": 21768704, "step": 21615 }, { "epoch": 10.193305044790193, "grad_norm": 0.7993398308753967, "learning_rate": 2.8510289935974057e-05, "loss": 0.0609, "num_input_tokens_seen": 21773728, "step": 21620 }, { "epoch": 10.195662423385196, "grad_norm": 0.3729739487171173, "learning_rate": 2.850010553578471e-05, "loss": 0.134, "num_input_tokens_seen": 21778784, "step": 21625 }, { "epoch": 10.198019801980198, "grad_norm": 0.6948964595794678, "learning_rate": 2.848992054308693e-05, "loss": 0.1385, "num_input_tokens_seen": 21784704, "step": 21630 }, { "epoch": 10.2003771805752, "grad_norm": 0.20941410958766937, "learning_rate": 2.8479734959604846e-05, "loss": 0.0463, "num_input_tokens_seen": 21789280, "step": 21635 }, { "epoch": 10.202734559170203, "grad_norm": 0.03052421100437641, "learning_rate": 2.8469548787062727e-05, "loss": 0.1056, "num_input_tokens_seen": 21792992, "step": 21640 }, { "epoch": 10.205091937765205, "grad_norm": 0.022636746987700462, "learning_rate": 2.84593620271849e-05, "loss": 0.095, "num_input_tokens_seen": 21797696, "step": 21645 }, { "epoch": 10.207449316360208, "grad_norm": 0.4768272042274475, "learning_rate": 2.8449174681695827e-05, "loss": 0.1014, "num_input_tokens_seen": 21802208, "step": 21650 }, { "epoch": 10.20980669495521, "grad_norm": 0.5668952465057373, "learning_rate": 2.8438986752320035e-05, "loss": 0.0417, "num_input_tokens_seen": 21807168, "step": 21655 }, { "epoch": 10.212164073550213, "grad_norm": 0.24474960565567017, "learning_rate": 2.8428798240782182e-05, "loss": 0.1046, "num_input_tokens_seen": 21812512, "step": 21660 }, { "epoch": 10.214521452145215, "grad_norm": 2.3093605041503906, "learning_rate": 2.8418609148807003e-05, "loss": 0.173, "num_input_tokens_seen": 21817728, "step": 21665 }, { "epoch": 10.216878830740217, "grad_norm": 0.442828506231308, "learning_rate": 2.840841947811934e-05, "loss": 0.0603, "num_input_tokens_seen": 21822464, "step": 21670 }, { "epoch": 10.21923620933522, "grad_norm": 0.03850135952234268, "learning_rate": 2.839822923044413e-05, "loss": 0.049, "num_input_tokens_seen": 21827296, "step": 21675 }, { "epoch": 10.221593587930222, "grad_norm": 0.9957615733146667, "learning_rate": 2.8388038407506413e-05, "loss": 0.0974, "num_input_tokens_seen": 21831456, "step": 21680 }, { "epoch": 10.223950966525225, "grad_norm": 1.9396318197250366, "learning_rate": 2.837784701103131e-05, "loss": 0.0657, "num_input_tokens_seen": 21836640, "step": 21685 }, { "epoch": 10.226308345120227, "grad_norm": 0.17807359993457794, "learning_rate": 2.8367655042744074e-05, "loss": 0.0236, "num_input_tokens_seen": 21841696, "step": 21690 }, { "epoch": 10.22866572371523, "grad_norm": 0.1632630079984665, "learning_rate": 2.8357462504370014e-05, "loss": 0.075, "num_input_tokens_seen": 21846176, "step": 21695 }, { "epoch": 10.231023102310232, "grad_norm": 1.0520175695419312, "learning_rate": 2.8347269397634553e-05, "loss": 0.1099, "num_input_tokens_seen": 21851872, "step": 21700 }, { "epoch": 10.233380480905234, "grad_norm": 0.30367353558540344, "learning_rate": 2.8337075724263218e-05, "loss": 0.1254, "num_input_tokens_seen": 21856896, "step": 21705 }, { "epoch": 10.235737859500237, "grad_norm": 0.03550904616713524, "learning_rate": 2.8326881485981633e-05, "loss": 0.0704, "num_input_tokens_seen": 21864064, "step": 21710 }, { "epoch": 10.238095238095237, "grad_norm": 0.02639899216592312, "learning_rate": 2.8316686684515488e-05, "loss": 0.0336, "num_input_tokens_seen": 21868608, "step": 21715 }, { "epoch": 10.24045261669024, "grad_norm": 1.3433181047439575, "learning_rate": 2.8306491321590607e-05, "loss": 0.1785, "num_input_tokens_seen": 21873856, "step": 21720 }, { "epoch": 10.242809995285242, "grad_norm": 0.6385334730148315, "learning_rate": 2.8296295398932886e-05, "loss": 0.1683, "num_input_tokens_seen": 21879424, "step": 21725 }, { "epoch": 10.245167373880244, "grad_norm": 0.1325293630361557, "learning_rate": 2.828609891826832e-05, "loss": 0.0196, "num_input_tokens_seen": 21883968, "step": 21730 }, { "epoch": 10.247524752475247, "grad_norm": 0.23491372168064117, "learning_rate": 2.8275901881323007e-05, "loss": 0.0456, "num_input_tokens_seen": 21888768, "step": 21735 }, { "epoch": 10.24988213107025, "grad_norm": 0.8841637969017029, "learning_rate": 2.8265704289823118e-05, "loss": 0.1209, "num_input_tokens_seen": 21893056, "step": 21740 }, { "epoch": 10.252239509665252, "grad_norm": 2.143772602081299, "learning_rate": 2.8255506145494948e-05, "loss": 0.0679, "num_input_tokens_seen": 21898208, "step": 21745 }, { "epoch": 10.254596888260254, "grad_norm": 1.319334626197815, "learning_rate": 2.8245307450064863e-05, "loss": 0.1035, "num_input_tokens_seen": 21902368, "step": 21750 }, { "epoch": 10.256954266855256, "grad_norm": 0.11816965788602829, "learning_rate": 2.8235108205259324e-05, "loss": 0.088, "num_input_tokens_seen": 21906816, "step": 21755 }, { "epoch": 10.259311645450259, "grad_norm": 0.2779937982559204, "learning_rate": 2.8224908412804895e-05, "loss": 0.187, "num_input_tokens_seen": 21911616, "step": 21760 }, { "epoch": 10.261669024045261, "grad_norm": 1.609079122543335, "learning_rate": 2.821470807442823e-05, "loss": 0.1301, "num_input_tokens_seen": 21916192, "step": 21765 }, { "epoch": 10.264026402640264, "grad_norm": 1.1411446332931519, "learning_rate": 2.8204507191856073e-05, "loss": 0.2156, "num_input_tokens_seen": 21920256, "step": 21770 }, { "epoch": 10.266383781235266, "grad_norm": 1.241363763809204, "learning_rate": 2.819430576681526e-05, "loss": 0.1895, "num_input_tokens_seen": 21924864, "step": 21775 }, { "epoch": 10.268741159830268, "grad_norm": 0.7322894334793091, "learning_rate": 2.8184103801032714e-05, "loss": 0.0935, "num_input_tokens_seen": 21930464, "step": 21780 }, { "epoch": 10.27109853842527, "grad_norm": 0.5912176966667175, "learning_rate": 2.817390129623545e-05, "loss": 0.0818, "num_input_tokens_seen": 21936160, "step": 21785 }, { "epoch": 10.273455917020273, "grad_norm": 1.9108314514160156, "learning_rate": 2.8163698254150593e-05, "loss": 0.0897, "num_input_tokens_seen": 21941184, "step": 21790 }, { "epoch": 10.275813295615276, "grad_norm": 3.759701728820801, "learning_rate": 2.8153494676505345e-05, "loss": 0.0583, "num_input_tokens_seen": 21945472, "step": 21795 }, { "epoch": 10.278170674210278, "grad_norm": 1.1646275520324707, "learning_rate": 2.814329056502698e-05, "loss": 0.1135, "num_input_tokens_seen": 21950496, "step": 21800 }, { "epoch": 10.28052805280528, "grad_norm": 0.30804890394210815, "learning_rate": 2.8133085921442893e-05, "loss": 0.0635, "num_input_tokens_seen": 21956000, "step": 21805 }, { "epoch": 10.282885431400283, "grad_norm": 0.08236134797334671, "learning_rate": 2.8122880747480555e-05, "loss": 0.0851, "num_input_tokens_seen": 21962624, "step": 21810 }, { "epoch": 10.285242809995285, "grad_norm": 0.0495530404150486, "learning_rate": 2.8112675044867543e-05, "loss": 0.2442, "num_input_tokens_seen": 21967680, "step": 21815 }, { "epoch": 10.287600188590288, "grad_norm": 1.1225858926773071, "learning_rate": 2.8102468815331485e-05, "loss": 0.0798, "num_input_tokens_seen": 21972608, "step": 21820 }, { "epoch": 10.28995756718529, "grad_norm": 0.5817811489105225, "learning_rate": 2.8092262060600133e-05, "loss": 0.0347, "num_input_tokens_seen": 21978624, "step": 21825 }, { "epoch": 10.292314945780292, "grad_norm": 0.4616939425468445, "learning_rate": 2.8082054782401322e-05, "loss": 0.0207, "num_input_tokens_seen": 21984256, "step": 21830 }, { "epoch": 10.294672324375295, "grad_norm": 0.09564992040395737, "learning_rate": 2.807184698246297e-05, "loss": 0.0528, "num_input_tokens_seen": 21989760, "step": 21835 }, { "epoch": 10.297029702970297, "grad_norm": 0.3673516511917114, "learning_rate": 2.8061638662513077e-05, "loss": 0.0196, "num_input_tokens_seen": 21994144, "step": 21840 }, { "epoch": 10.2993870815653, "grad_norm": 1.2969605922698975, "learning_rate": 2.8051429824279745e-05, "loss": 0.0345, "num_input_tokens_seen": 22000512, "step": 21845 }, { "epoch": 10.301744460160302, "grad_norm": 2.325138807296753, "learning_rate": 2.804122046949115e-05, "loss": 0.1514, "num_input_tokens_seen": 22005952, "step": 21850 }, { "epoch": 10.304101838755304, "grad_norm": 0.5630825757980347, "learning_rate": 2.803101059987556e-05, "loss": 0.0484, "num_input_tokens_seen": 22010560, "step": 21855 }, { "epoch": 10.306459217350307, "grad_norm": 0.2907162606716156, "learning_rate": 2.8020800217161353e-05, "loss": 0.0801, "num_input_tokens_seen": 22015648, "step": 21860 }, { "epoch": 10.30881659594531, "grad_norm": 1.187279224395752, "learning_rate": 2.8010589323076952e-05, "loss": 0.3151, "num_input_tokens_seen": 22020320, "step": 21865 }, { "epoch": 10.311173974540312, "grad_norm": 0.06520181894302368, "learning_rate": 2.8000377919350897e-05, "loss": 0.1504, "num_input_tokens_seen": 22025280, "step": 21870 }, { "epoch": 10.313531353135314, "grad_norm": 1.0109916925430298, "learning_rate": 2.79901660077118e-05, "loss": 0.1017, "num_input_tokens_seen": 22031744, "step": 21875 }, { "epoch": 10.315888731730317, "grad_norm": 0.06039735674858093, "learning_rate": 2.7979953589888376e-05, "loss": 0.0563, "num_input_tokens_seen": 22036832, "step": 21880 }, { "epoch": 10.318246110325319, "grad_norm": 0.19131916761398315, "learning_rate": 2.79697406676094e-05, "loss": 0.1075, "num_input_tokens_seen": 22042144, "step": 21885 }, { "epoch": 10.320603488920321, "grad_norm": 1.4224241971969604, "learning_rate": 2.795952724260375e-05, "loss": 0.136, "num_input_tokens_seen": 22046720, "step": 21890 }, { "epoch": 10.322960867515324, "grad_norm": 2.7539846897125244, "learning_rate": 2.794931331660039e-05, "loss": 0.2172, "num_input_tokens_seen": 22052992, "step": 21895 }, { "epoch": 10.325318246110326, "grad_norm": 0.2350255250930786, "learning_rate": 2.793909889132837e-05, "loss": 0.1307, "num_input_tokens_seen": 22057088, "step": 21900 }, { "epoch": 10.327675624705329, "grad_norm": 0.01292362716048956, "learning_rate": 2.79288839685168e-05, "loss": 0.0502, "num_input_tokens_seen": 22061120, "step": 21905 }, { "epoch": 10.33003300330033, "grad_norm": 0.292108952999115, "learning_rate": 2.7918668549894904e-05, "loss": 0.0955, "num_input_tokens_seen": 22066048, "step": 21910 }, { "epoch": 10.332390381895332, "grad_norm": 0.20983760058879852, "learning_rate": 2.7908452637191984e-05, "loss": 0.0253, "num_input_tokens_seen": 22070560, "step": 21915 }, { "epoch": 10.334747760490334, "grad_norm": 0.1493871510028839, "learning_rate": 2.7898236232137414e-05, "loss": 0.0944, "num_input_tokens_seen": 22075232, "step": 21920 }, { "epoch": 10.337105139085336, "grad_norm": 0.36546340584754944, "learning_rate": 2.7888019336460657e-05, "loss": 0.1121, "num_input_tokens_seen": 22081696, "step": 21925 }, { "epoch": 10.339462517680339, "grad_norm": 0.5265887975692749, "learning_rate": 2.7877801951891264e-05, "loss": 0.1441, "num_input_tokens_seen": 22086432, "step": 21930 }, { "epoch": 10.341819896275341, "grad_norm": 0.5721399188041687, "learning_rate": 2.7867584080158865e-05, "loss": 0.0758, "num_input_tokens_seen": 22092320, "step": 21935 }, { "epoch": 10.344177274870344, "grad_norm": 0.29127952456474304, "learning_rate": 2.7857365722993168e-05, "loss": 0.1616, "num_input_tokens_seen": 22096192, "step": 21940 }, { "epoch": 10.346534653465346, "grad_norm": 1.2327581644058228, "learning_rate": 2.784714688212397e-05, "loss": 0.4166, "num_input_tokens_seen": 22101120, "step": 21945 }, { "epoch": 10.348892032060348, "grad_norm": 0.16519689559936523, "learning_rate": 2.7836927559281152e-05, "loss": 0.1213, "num_input_tokens_seen": 22105152, "step": 21950 }, { "epoch": 10.35124941065535, "grad_norm": 2.0531792640686035, "learning_rate": 2.782670775619466e-05, "loss": 0.065, "num_input_tokens_seen": 22110048, "step": 21955 }, { "epoch": 10.353606789250353, "grad_norm": 0.08592478930950165, "learning_rate": 2.7816487474594534e-05, "loss": 0.1021, "num_input_tokens_seen": 22113568, "step": 21960 }, { "epoch": 10.355964167845356, "grad_norm": 0.36038434505462646, "learning_rate": 2.780626671621091e-05, "loss": 0.0519, "num_input_tokens_seen": 22118976, "step": 21965 }, { "epoch": 10.358321546440358, "grad_norm": 0.02445489726960659, "learning_rate": 2.7796045482773976e-05, "loss": 0.0859, "num_input_tokens_seen": 22126240, "step": 21970 }, { "epoch": 10.36067892503536, "grad_norm": 0.12070591747760773, "learning_rate": 2.778582377601401e-05, "loss": 0.129, "num_input_tokens_seen": 22131936, "step": 21975 }, { "epoch": 10.363036303630363, "grad_norm": 1.8712049722671509, "learning_rate": 2.7775601597661384e-05, "loss": 0.1455, "num_input_tokens_seen": 22136704, "step": 21980 }, { "epoch": 10.365393682225365, "grad_norm": 0.14551855623722076, "learning_rate": 2.7765378949446535e-05, "loss": 0.1654, "num_input_tokens_seen": 22141024, "step": 21985 }, { "epoch": 10.367751060820368, "grad_norm": 1.558403730392456, "learning_rate": 2.7755155833099982e-05, "loss": 0.3227, "num_input_tokens_seen": 22146592, "step": 21990 }, { "epoch": 10.37010843941537, "grad_norm": 1.2453678846359253, "learning_rate": 2.7744932250352324e-05, "loss": 0.0497, "num_input_tokens_seen": 22151904, "step": 21995 }, { "epoch": 10.372465818010372, "grad_norm": 0.6403395533561707, "learning_rate": 2.7734708202934247e-05, "loss": 0.0252, "num_input_tokens_seen": 22155840, "step": 22000 }, { "epoch": 10.374823196605375, "grad_norm": 1.0583102703094482, "learning_rate": 2.77244836925765e-05, "loss": 0.1912, "num_input_tokens_seen": 22160544, "step": 22005 }, { "epoch": 10.377180575200377, "grad_norm": 1.118601679801941, "learning_rate": 2.7714258721009927e-05, "loss": 0.3668, "num_input_tokens_seen": 22165152, "step": 22010 }, { "epoch": 10.37953795379538, "grad_norm": 0.2364761382341385, "learning_rate": 2.7704033289965437e-05, "loss": 0.2036, "num_input_tokens_seen": 22169728, "step": 22015 }, { "epoch": 10.381895332390382, "grad_norm": 2.708138942718506, "learning_rate": 2.7693807401174023e-05, "loss": 0.1491, "num_input_tokens_seen": 22173984, "step": 22020 }, { "epoch": 10.384252710985384, "grad_norm": 1.6684291362762451, "learning_rate": 2.7683581056366755e-05, "loss": 0.2411, "num_input_tokens_seen": 22179776, "step": 22025 }, { "epoch": 10.386610089580387, "grad_norm": 1.6999708414077759, "learning_rate": 2.7673354257274775e-05, "loss": 0.2832, "num_input_tokens_seen": 22185664, "step": 22030 }, { "epoch": 10.38896746817539, "grad_norm": 0.7201277613639832, "learning_rate": 2.7663127005629306e-05, "loss": 0.1411, "num_input_tokens_seen": 22191648, "step": 22035 }, { "epoch": 10.391324846770392, "grad_norm": 0.5335602164268494, "learning_rate": 2.7652899303161662e-05, "loss": 0.2098, "num_input_tokens_seen": 22197824, "step": 22040 }, { "epoch": 10.393682225365394, "grad_norm": 0.3109687864780426, "learning_rate": 2.7642671151603207e-05, "loss": 0.0303, "num_input_tokens_seen": 22202688, "step": 22045 }, { "epoch": 10.396039603960396, "grad_norm": 0.022053856402635574, "learning_rate": 2.7632442552685395e-05, "loss": 0.0819, "num_input_tokens_seen": 22207808, "step": 22050 }, { "epoch": 10.398396982555399, "grad_norm": 2.3438773155212402, "learning_rate": 2.7622213508139754e-05, "loss": 0.0984, "num_input_tokens_seen": 22212320, "step": 22055 }, { "epoch": 10.400754361150401, "grad_norm": 0.06178446486592293, "learning_rate": 2.761198401969789e-05, "loss": 0.0171, "num_input_tokens_seen": 22217536, "step": 22060 }, { "epoch": 10.403111739745404, "grad_norm": 0.10199886560440063, "learning_rate": 2.760175408909148e-05, "loss": 0.1167, "num_input_tokens_seen": 22221472, "step": 22065 }, { "epoch": 10.405469118340406, "grad_norm": 0.5825182199478149, "learning_rate": 2.7591523718052287e-05, "loss": 0.0839, "num_input_tokens_seen": 22228064, "step": 22070 }, { "epoch": 10.407826496935408, "grad_norm": 1.2728148698806763, "learning_rate": 2.7581292908312116e-05, "loss": 0.0874, "num_input_tokens_seen": 22232544, "step": 22075 }, { "epoch": 10.41018387553041, "grad_norm": 0.10633432865142822, "learning_rate": 2.7571061661602887e-05, "loss": 0.3829, "num_input_tokens_seen": 22238016, "step": 22080 }, { "epoch": 10.412541254125413, "grad_norm": 0.19811666011810303, "learning_rate": 2.7560829979656577e-05, "loss": 0.0717, "num_input_tokens_seen": 22242976, "step": 22085 }, { "epoch": 10.414898632720416, "grad_norm": 1.0988311767578125, "learning_rate": 2.755059786420523e-05, "loss": 0.2507, "num_input_tokens_seen": 22248416, "step": 22090 }, { "epoch": 10.417256011315418, "grad_norm": 0.24165520071983337, "learning_rate": 2.754036531698097e-05, "loss": 0.0675, "num_input_tokens_seen": 22253984, "step": 22095 }, { "epoch": 10.41961338991042, "grad_norm": 0.8636582493782043, "learning_rate": 2.7530132339715998e-05, "loss": 0.21, "num_input_tokens_seen": 22260640, "step": 22100 }, { "epoch": 10.421970768505423, "grad_norm": 0.7952001690864563, "learning_rate": 2.7519898934142573e-05, "loss": 0.0613, "num_input_tokens_seen": 22265952, "step": 22105 }, { "epoch": 10.424328147100425, "grad_norm": 0.46473395824432373, "learning_rate": 2.7509665101993053e-05, "loss": 0.0864, "num_input_tokens_seen": 22270848, "step": 22110 }, { "epoch": 10.426685525695426, "grad_norm": 0.13725800812244415, "learning_rate": 2.749943084499984e-05, "loss": 0.0542, "num_input_tokens_seen": 22275552, "step": 22115 }, { "epoch": 10.429042904290428, "grad_norm": 0.5955230593681335, "learning_rate": 2.748919616489542e-05, "loss": 0.0592, "num_input_tokens_seen": 22280512, "step": 22120 }, { "epoch": 10.43140028288543, "grad_norm": 0.5332774519920349, "learning_rate": 2.747896106341235e-05, "loss": 0.0732, "num_input_tokens_seen": 22285184, "step": 22125 }, { "epoch": 10.433757661480433, "grad_norm": 1.6705478429794312, "learning_rate": 2.7468725542283275e-05, "loss": 0.1081, "num_input_tokens_seen": 22289856, "step": 22130 }, { "epoch": 10.436115040075435, "grad_norm": 0.19261960685253143, "learning_rate": 2.745848960324087e-05, "loss": 0.0731, "num_input_tokens_seen": 22294720, "step": 22135 }, { "epoch": 10.438472418670438, "grad_norm": 0.5494722723960876, "learning_rate": 2.744825324801792e-05, "loss": 0.0999, "num_input_tokens_seen": 22299328, "step": 22140 }, { "epoch": 10.44082979726544, "grad_norm": 0.07220268249511719, "learning_rate": 2.743801647834727e-05, "loss": 0.1196, "num_input_tokens_seen": 22304416, "step": 22145 }, { "epoch": 10.443187175860443, "grad_norm": 1.1807763576507568, "learning_rate": 2.7427779295961824e-05, "loss": 0.2945, "num_input_tokens_seen": 22309376, "step": 22150 }, { "epoch": 10.445544554455445, "grad_norm": 0.4167744815349579, "learning_rate": 2.7417541702594563e-05, "loss": 0.2133, "num_input_tokens_seen": 22314208, "step": 22155 }, { "epoch": 10.447901933050447, "grad_norm": 0.9282798171043396, "learning_rate": 2.740730369997853e-05, "loss": 0.0516, "num_input_tokens_seen": 22319136, "step": 22160 }, { "epoch": 10.45025931164545, "grad_norm": 0.2648831307888031, "learning_rate": 2.7397065289846868e-05, "loss": 0.3372, "num_input_tokens_seen": 22323872, "step": 22165 }, { "epoch": 10.452616690240452, "grad_norm": 2.5655739307403564, "learning_rate": 2.7386826473932747e-05, "loss": 0.1851, "num_input_tokens_seen": 22328896, "step": 22170 }, { "epoch": 10.454974068835455, "grad_norm": 2.4760918617248535, "learning_rate": 2.737658725396944e-05, "loss": 0.0628, "num_input_tokens_seen": 22333760, "step": 22175 }, { "epoch": 10.457331447430457, "grad_norm": 0.20377743244171143, "learning_rate": 2.7366347631690248e-05, "loss": 0.0543, "num_input_tokens_seen": 22338848, "step": 22180 }, { "epoch": 10.45968882602546, "grad_norm": 0.18784379959106445, "learning_rate": 2.7356107608828586e-05, "loss": 0.0261, "num_input_tokens_seen": 22344448, "step": 22185 }, { "epoch": 10.462046204620462, "grad_norm": 0.3163072168827057, "learning_rate": 2.7345867187117908e-05, "loss": 0.0327, "num_input_tokens_seen": 22349216, "step": 22190 }, { "epoch": 10.464403583215464, "grad_norm": 0.15448223054409027, "learning_rate": 2.733562636829175e-05, "loss": 0.1413, "num_input_tokens_seen": 22353600, "step": 22195 }, { "epoch": 10.466760961810467, "grad_norm": 2.51511287689209, "learning_rate": 2.73253851540837e-05, "loss": 0.1303, "num_input_tokens_seen": 22359808, "step": 22200 }, { "epoch": 10.469118340405469, "grad_norm": 0.838133692741394, "learning_rate": 2.7315143546227427e-05, "loss": 0.1506, "num_input_tokens_seen": 22363872, "step": 22205 }, { "epoch": 10.471475719000471, "grad_norm": 0.009930922649800777, "learning_rate": 2.730490154645666e-05, "loss": 0.206, "num_input_tokens_seen": 22368512, "step": 22210 }, { "epoch": 10.473833097595474, "grad_norm": 1.6415565013885498, "learning_rate": 2.7294659156505203e-05, "loss": 0.2259, "num_input_tokens_seen": 22372704, "step": 22215 }, { "epoch": 10.476190476190476, "grad_norm": 1.2094796895980835, "learning_rate": 2.72844163781069e-05, "loss": 0.1567, "num_input_tokens_seen": 22377024, "step": 22220 }, { "epoch": 10.478547854785479, "grad_norm": 1.0822497606277466, "learning_rate": 2.72741732129957e-05, "loss": 0.1986, "num_input_tokens_seen": 22382560, "step": 22225 }, { "epoch": 10.480905233380481, "grad_norm": 0.30730488896369934, "learning_rate": 2.7263929662905583e-05, "loss": 0.1503, "num_input_tokens_seen": 22387616, "step": 22230 }, { "epoch": 10.483262611975483, "grad_norm": 0.565738320350647, "learning_rate": 2.7253685729570628e-05, "loss": 0.2191, "num_input_tokens_seen": 22392160, "step": 22235 }, { "epoch": 10.485619990570486, "grad_norm": 0.8403074145317078, "learning_rate": 2.724344141472493e-05, "loss": 0.1178, "num_input_tokens_seen": 22397152, "step": 22240 }, { "epoch": 10.487977369165488, "grad_norm": 1.3523976802825928, "learning_rate": 2.7233196720102693e-05, "loss": 0.1542, "num_input_tokens_seen": 22402112, "step": 22245 }, { "epoch": 10.49033474776049, "grad_norm": 1.593155860900879, "learning_rate": 2.722295164743817e-05, "loss": 0.2039, "num_input_tokens_seen": 22407328, "step": 22250 }, { "epoch": 10.492692126355493, "grad_norm": 0.4559956192970276, "learning_rate": 2.7212706198465682e-05, "loss": 0.0252, "num_input_tokens_seen": 22412480, "step": 22255 }, { "epoch": 10.495049504950495, "grad_norm": 0.12348570674657822, "learning_rate": 2.7202460374919598e-05, "loss": 0.0489, "num_input_tokens_seen": 22418816, "step": 22260 }, { "epoch": 10.497406883545498, "grad_norm": 1.2267322540283203, "learning_rate": 2.719221417853437e-05, "loss": 0.082, "num_input_tokens_seen": 22423296, "step": 22265 }, { "epoch": 10.4997642621405, "grad_norm": 0.2234324812889099, "learning_rate": 2.71819676110445e-05, "loss": 0.0622, "num_input_tokens_seen": 22428064, "step": 22270 }, { "epoch": 10.502121640735503, "grad_norm": 0.3706277310848236, "learning_rate": 2.717172067418457e-05, "loss": 0.0411, "num_input_tokens_seen": 22432928, "step": 22275 }, { "epoch": 10.504479019330505, "grad_norm": 3.163975477218628, "learning_rate": 2.7161473369689194e-05, "loss": 0.119, "num_input_tokens_seen": 22436736, "step": 22280 }, { "epoch": 10.506836397925507, "grad_norm": 1.4705742597579956, "learning_rate": 2.715122569929308e-05, "loss": 0.1373, "num_input_tokens_seen": 22442208, "step": 22285 }, { "epoch": 10.50919377652051, "grad_norm": 0.6944290995597839, "learning_rate": 2.7140977664730986e-05, "loss": 0.2584, "num_input_tokens_seen": 22447168, "step": 22290 }, { "epoch": 10.511551155115512, "grad_norm": 0.8708451390266418, "learning_rate": 2.7130729267737726e-05, "loss": 0.0923, "num_input_tokens_seen": 22452096, "step": 22295 }, { "epoch": 10.513908533710515, "grad_norm": 0.014819081872701645, "learning_rate": 2.7120480510048168e-05, "loss": 0.0709, "num_input_tokens_seen": 22456384, "step": 22300 }, { "epoch": 10.516265912305517, "grad_norm": 0.030332796275615692, "learning_rate": 2.7110231393397268e-05, "loss": 0.039, "num_input_tokens_seen": 22460896, "step": 22305 }, { "epoch": 10.518623290900518, "grad_norm": 0.4323898255825043, "learning_rate": 2.7099981919520024e-05, "loss": 0.2541, "num_input_tokens_seen": 22466464, "step": 22310 }, { "epoch": 10.520980669495522, "grad_norm": 2.679867744445801, "learning_rate": 2.70897320901515e-05, "loss": 0.3366, "num_input_tokens_seen": 22470688, "step": 22315 }, { "epoch": 10.523338048090523, "grad_norm": 0.9957136511802673, "learning_rate": 2.707948190702681e-05, "loss": 0.1078, "num_input_tokens_seen": 22475840, "step": 22320 }, { "epoch": 10.525695426685525, "grad_norm": 0.17191477119922638, "learning_rate": 2.7069231371881138e-05, "loss": 0.1015, "num_input_tokens_seen": 22480032, "step": 22325 }, { "epoch": 10.528052805280527, "grad_norm": 0.3862885534763336, "learning_rate": 2.7058980486449732e-05, "loss": 0.1097, "num_input_tokens_seen": 22484576, "step": 22330 }, { "epoch": 10.53041018387553, "grad_norm": 0.25355440378189087, "learning_rate": 2.704872925246789e-05, "loss": 0.0182, "num_input_tokens_seen": 22489056, "step": 22335 }, { "epoch": 10.532767562470532, "grad_norm": 1.8198491334915161, "learning_rate": 2.703847767167097e-05, "loss": 0.1323, "num_input_tokens_seen": 22492992, "step": 22340 }, { "epoch": 10.535124941065535, "grad_norm": 0.05995410308241844, "learning_rate": 2.702822574579439e-05, "loss": 0.0909, "num_input_tokens_seen": 22498144, "step": 22345 }, { "epoch": 10.537482319660537, "grad_norm": 0.8359562754631042, "learning_rate": 2.7017973476573625e-05, "loss": 0.0647, "num_input_tokens_seen": 22502400, "step": 22350 }, { "epoch": 10.53983969825554, "grad_norm": 0.7915037274360657, "learning_rate": 2.7007720865744224e-05, "loss": 0.0955, "num_input_tokens_seen": 22507360, "step": 22355 }, { "epoch": 10.542197076850542, "grad_norm": 1.1950407028198242, "learning_rate": 2.6997467915041765e-05, "loss": 0.0733, "num_input_tokens_seen": 22512608, "step": 22360 }, { "epoch": 10.544554455445544, "grad_norm": 0.020977871492505074, "learning_rate": 2.69872146262019e-05, "loss": 0.0525, "num_input_tokens_seen": 22516896, "step": 22365 }, { "epoch": 10.546911834040547, "grad_norm": 2.00032377243042, "learning_rate": 2.6976961000960345e-05, "loss": 0.1544, "num_input_tokens_seen": 22522112, "step": 22370 }, { "epoch": 10.549269212635549, "grad_norm": 0.7342687249183655, "learning_rate": 2.6966707041052858e-05, "loss": 0.0792, "num_input_tokens_seen": 22528672, "step": 22375 }, { "epoch": 10.551626591230551, "grad_norm": 0.9836580157279968, "learning_rate": 2.6956452748215255e-05, "loss": 0.1408, "num_input_tokens_seen": 22534592, "step": 22380 }, { "epoch": 10.553983969825554, "grad_norm": 0.06177683174610138, "learning_rate": 2.6946198124183423e-05, "loss": 0.0675, "num_input_tokens_seen": 22539808, "step": 22385 }, { "epoch": 10.556341348420556, "grad_norm": 0.10433358699083328, "learning_rate": 2.6935943170693295e-05, "loss": 0.065, "num_input_tokens_seen": 22544832, "step": 22390 }, { "epoch": 10.558698727015559, "grad_norm": 0.19292868673801422, "learning_rate": 2.6925687889480865e-05, "loss": 0.0894, "num_input_tokens_seen": 22548864, "step": 22395 }, { "epoch": 10.561056105610561, "grad_norm": 0.3469388782978058, "learning_rate": 2.691543228228217e-05, "loss": 0.063, "num_input_tokens_seen": 22552864, "step": 22400 }, { "epoch": 10.563413484205563, "grad_norm": 0.14664509892463684, "learning_rate": 2.6905176350833307e-05, "loss": 0.0338, "num_input_tokens_seen": 22556512, "step": 22405 }, { "epoch": 10.565770862800566, "grad_norm": 0.3075360953807831, "learning_rate": 2.6894920096870435e-05, "loss": 0.0753, "num_input_tokens_seen": 22561280, "step": 22410 }, { "epoch": 10.568128241395568, "grad_norm": 1.0497852563858032, "learning_rate": 2.6884663522129766e-05, "loss": 0.0634, "num_input_tokens_seen": 22565792, "step": 22415 }, { "epoch": 10.57048561999057, "grad_norm": 0.06705127656459808, "learning_rate": 2.6874406628347564e-05, "loss": 0.0909, "num_input_tokens_seen": 22570784, "step": 22420 }, { "epoch": 10.572842998585573, "grad_norm": 0.2477601021528244, "learning_rate": 2.6864149417260142e-05, "loss": 0.1066, "num_input_tokens_seen": 22575776, "step": 22425 }, { "epoch": 10.575200377180575, "grad_norm": 0.12157219648361206, "learning_rate": 2.6853891890603877e-05, "loss": 0.202, "num_input_tokens_seen": 22580864, "step": 22430 }, { "epoch": 10.577557755775578, "grad_norm": 0.30434560775756836, "learning_rate": 2.684363405011519e-05, "loss": 0.0976, "num_input_tokens_seen": 22585440, "step": 22435 }, { "epoch": 10.57991513437058, "grad_norm": 1.9956598281860352, "learning_rate": 2.6833375897530573e-05, "loss": 0.1796, "num_input_tokens_seen": 22591744, "step": 22440 }, { "epoch": 10.582272512965583, "grad_norm": 1.1092714071273804, "learning_rate": 2.682311743458653e-05, "loss": 0.1456, "num_input_tokens_seen": 22597056, "step": 22445 }, { "epoch": 10.584629891560585, "grad_norm": 0.4846936762332916, "learning_rate": 2.6812858663019668e-05, "loss": 0.1631, "num_input_tokens_seen": 22601824, "step": 22450 }, { "epoch": 10.586987270155587, "grad_norm": 0.25009796023368835, "learning_rate": 2.6802599584566613e-05, "loss": 0.0482, "num_input_tokens_seen": 22607008, "step": 22455 }, { "epoch": 10.58934464875059, "grad_norm": 0.902184784412384, "learning_rate": 2.6792340200964056e-05, "loss": 0.1033, "num_input_tokens_seen": 22611968, "step": 22460 }, { "epoch": 10.591702027345592, "grad_norm": 0.747541069984436, "learning_rate": 2.678208051394873e-05, "loss": 0.0933, "num_input_tokens_seen": 22617536, "step": 22465 }, { "epoch": 10.594059405940595, "grad_norm": 0.8092247247695923, "learning_rate": 2.6771820525257434e-05, "loss": 0.1555, "num_input_tokens_seen": 22622464, "step": 22470 }, { "epoch": 10.596416784535597, "grad_norm": 0.01871582306921482, "learning_rate": 2.6761560236627008e-05, "loss": 0.0143, "num_input_tokens_seen": 22627168, "step": 22475 }, { "epoch": 10.5987741631306, "grad_norm": 0.018328873440623283, "learning_rate": 2.675129964979435e-05, "loss": 0.1453, "num_input_tokens_seen": 22631904, "step": 22480 }, { "epoch": 10.601131541725602, "grad_norm": 1.6256029605865479, "learning_rate": 2.674103876649639e-05, "loss": 0.2278, "num_input_tokens_seen": 22636704, "step": 22485 }, { "epoch": 10.603488920320604, "grad_norm": 0.47231897711753845, "learning_rate": 2.6730777588470125e-05, "loss": 0.0465, "num_input_tokens_seen": 22641440, "step": 22490 }, { "epoch": 10.605846298915607, "grad_norm": 0.05515900254249573, "learning_rate": 2.6720516117452604e-05, "loss": 0.2015, "num_input_tokens_seen": 22645952, "step": 22495 }, { "epoch": 10.608203677510609, "grad_norm": 0.03793707862496376, "learning_rate": 2.671025435518092e-05, "loss": 0.1917, "num_input_tokens_seen": 22650912, "step": 22500 }, { "epoch": 10.61056105610561, "grad_norm": 0.3018178641796112, "learning_rate": 2.6699992303392206e-05, "loss": 0.0448, "num_input_tokens_seen": 22654880, "step": 22505 }, { "epoch": 10.612918434700614, "grad_norm": 0.04322989657521248, "learning_rate": 2.668972996382366e-05, "loss": 0.0454, "num_input_tokens_seen": 22659104, "step": 22510 }, { "epoch": 10.615275813295614, "grad_norm": 0.053905121982097626, "learning_rate": 2.667946733821253e-05, "loss": 0.0663, "num_input_tokens_seen": 22664608, "step": 22515 }, { "epoch": 10.617633191890617, "grad_norm": 2.324275255203247, "learning_rate": 2.666920442829609e-05, "loss": 0.2022, "num_input_tokens_seen": 22669632, "step": 22520 }, { "epoch": 10.61999057048562, "grad_norm": 0.3222067058086395, "learning_rate": 2.6658941235811684e-05, "loss": 0.1735, "num_input_tokens_seen": 22674368, "step": 22525 }, { "epoch": 10.622347949080622, "grad_norm": 0.24512794613838196, "learning_rate": 2.6648677762496692e-05, "loss": 0.2019, "num_input_tokens_seen": 22679328, "step": 22530 }, { "epoch": 10.624705327675624, "grad_norm": 1.0777333974838257, "learning_rate": 2.6638414010088546e-05, "loss": 0.1788, "num_input_tokens_seen": 22683520, "step": 22535 }, { "epoch": 10.627062706270626, "grad_norm": 0.5898631811141968, "learning_rate": 2.6628149980324722e-05, "loss": 0.2394, "num_input_tokens_seen": 22688000, "step": 22540 }, { "epoch": 10.629420084865629, "grad_norm": 0.7331210970878601, "learning_rate": 2.6617885674942765e-05, "loss": 0.1466, "num_input_tokens_seen": 22693248, "step": 22545 }, { "epoch": 10.631777463460631, "grad_norm": 1.11675226688385, "learning_rate": 2.6607621095680225e-05, "loss": 0.0683, "num_input_tokens_seen": 22697856, "step": 22550 }, { "epoch": 10.634134842055634, "grad_norm": 0.8942615389823914, "learning_rate": 2.6597356244274724e-05, "loss": 0.0762, "num_input_tokens_seen": 22703616, "step": 22555 }, { "epoch": 10.636492220650636, "grad_norm": 0.3842019736766815, "learning_rate": 2.6587091122463936e-05, "loss": 0.0759, "num_input_tokens_seen": 22708416, "step": 22560 }, { "epoch": 10.638849599245638, "grad_norm": 1.6708804368972778, "learning_rate": 2.6576825731985576e-05, "loss": 0.2111, "num_input_tokens_seen": 22712512, "step": 22565 }, { "epoch": 10.64120697784064, "grad_norm": 2.5671095848083496, "learning_rate": 2.6566560074577383e-05, "loss": 0.1708, "num_input_tokens_seen": 22716768, "step": 22570 }, { "epoch": 10.643564356435643, "grad_norm": 0.901723325252533, "learning_rate": 2.6556294151977168e-05, "loss": 0.0572, "num_input_tokens_seen": 22721664, "step": 22575 }, { "epoch": 10.645921735030646, "grad_norm": 0.16273616254329681, "learning_rate": 2.6546027965922783e-05, "loss": 0.0906, "num_input_tokens_seen": 22727040, "step": 22580 }, { "epoch": 10.648279113625648, "grad_norm": 0.773065984249115, "learning_rate": 2.6535761518152115e-05, "loss": 0.1157, "num_input_tokens_seen": 22731936, "step": 22585 }, { "epoch": 10.65063649222065, "grad_norm": 2.618896007537842, "learning_rate": 2.652549481040309e-05, "loss": 0.2467, "num_input_tokens_seen": 22736352, "step": 22590 }, { "epoch": 10.652993870815653, "grad_norm": 0.1652282476425171, "learning_rate": 2.6515227844413698e-05, "loss": 0.102, "num_input_tokens_seen": 22740576, "step": 22595 }, { "epoch": 10.655351249410655, "grad_norm": 0.9266705513000488, "learning_rate": 2.650496062192196e-05, "loss": 0.173, "num_input_tokens_seen": 22745792, "step": 22600 }, { "epoch": 10.657708628005658, "grad_norm": 0.995171308517456, "learning_rate": 2.6494693144665943e-05, "loss": 0.2231, "num_input_tokens_seen": 22749952, "step": 22605 }, { "epoch": 10.66006600660066, "grad_norm": 0.06342077255249023, "learning_rate": 2.6484425414383756e-05, "loss": 0.0502, "num_input_tokens_seen": 22753952, "step": 22610 }, { "epoch": 10.662423385195662, "grad_norm": 0.47100621461868286, "learning_rate": 2.6474157432813545e-05, "loss": 0.0663, "num_input_tokens_seen": 22758240, "step": 22615 }, { "epoch": 10.664780763790665, "grad_norm": 1.0507912635803223, "learning_rate": 2.6463889201693516e-05, "loss": 0.2497, "num_input_tokens_seen": 22763328, "step": 22620 }, { "epoch": 10.667138142385667, "grad_norm": 0.6217374801635742, "learning_rate": 2.6453620722761896e-05, "loss": 0.0946, "num_input_tokens_seen": 22767648, "step": 22625 }, { "epoch": 10.66949552098067, "grad_norm": 1.3937649726867676, "learning_rate": 2.6443351997756976e-05, "loss": 0.2562, "num_input_tokens_seen": 22772512, "step": 22630 }, { "epoch": 10.671852899575672, "grad_norm": 2.757343053817749, "learning_rate": 2.6433083028417067e-05, "loss": 0.1875, "num_input_tokens_seen": 22777312, "step": 22635 }, { "epoch": 10.674210278170674, "grad_norm": 0.07811181247234344, "learning_rate": 2.642281381648053e-05, "loss": 0.1334, "num_input_tokens_seen": 22781696, "step": 22640 }, { "epoch": 10.676567656765677, "grad_norm": 1.1426254510879517, "learning_rate": 2.641254436368577e-05, "loss": 0.0865, "num_input_tokens_seen": 22786592, "step": 22645 }, { "epoch": 10.67892503536068, "grad_norm": 0.46333470940589905, "learning_rate": 2.6402274671771238e-05, "loss": 0.1755, "num_input_tokens_seen": 22792928, "step": 22650 }, { "epoch": 10.681282413955682, "grad_norm": 0.9297959208488464, "learning_rate": 2.6392004742475405e-05, "loss": 0.0814, "num_input_tokens_seen": 22797568, "step": 22655 }, { "epoch": 10.683639792550684, "grad_norm": 0.8122251033782959, "learning_rate": 2.6381734577536816e-05, "loss": 0.1532, "num_input_tokens_seen": 22801792, "step": 22660 }, { "epoch": 10.685997171145686, "grad_norm": 0.09569874405860901, "learning_rate": 2.6371464178694012e-05, "loss": 0.1156, "num_input_tokens_seen": 22806848, "step": 22665 }, { "epoch": 10.688354549740689, "grad_norm": 0.8941145539283752, "learning_rate": 2.636119354768562e-05, "loss": 0.1597, "num_input_tokens_seen": 22810944, "step": 22670 }, { "epoch": 10.690711928335691, "grad_norm": 1.2037692070007324, "learning_rate": 2.6350922686250263e-05, "loss": 0.1027, "num_input_tokens_seen": 22816096, "step": 22675 }, { "epoch": 10.693069306930694, "grad_norm": 0.3248058259487152, "learning_rate": 2.6340651596126636e-05, "loss": 0.0436, "num_input_tokens_seen": 22821408, "step": 22680 }, { "epoch": 10.695426685525696, "grad_norm": 0.9152326583862305, "learning_rate": 2.6330380279053456e-05, "loss": 0.1192, "num_input_tokens_seen": 22825632, "step": 22685 }, { "epoch": 10.697784064120698, "grad_norm": 1.8021103143692017, "learning_rate": 2.632010873676949e-05, "loss": 0.2843, "num_input_tokens_seen": 22830272, "step": 22690 }, { "epoch": 10.700141442715701, "grad_norm": 0.6773754954338074, "learning_rate": 2.6309836971013517e-05, "loss": 0.1409, "num_input_tokens_seen": 22835008, "step": 22695 }, { "epoch": 10.702498821310703, "grad_norm": 0.4281509816646576, "learning_rate": 2.6299564983524388e-05, "loss": 0.2197, "num_input_tokens_seen": 22840512, "step": 22700 }, { "epoch": 10.704856199905706, "grad_norm": 0.38677820563316345, "learning_rate": 2.6289292776040975e-05, "loss": 0.0903, "num_input_tokens_seen": 22845376, "step": 22705 }, { "epoch": 10.707213578500706, "grad_norm": 0.7707657814025879, "learning_rate": 2.627902035030219e-05, "loss": 0.1508, "num_input_tokens_seen": 22850144, "step": 22710 }, { "epoch": 10.70957095709571, "grad_norm": 2.6796305179595947, "learning_rate": 2.626874770804697e-05, "loss": 0.1221, "num_input_tokens_seen": 22855264, "step": 22715 }, { "epoch": 10.711928335690711, "grad_norm": 0.18824666738510132, "learning_rate": 2.625847485101431e-05, "loss": 0.118, "num_input_tokens_seen": 22860064, "step": 22720 }, { "epoch": 10.714285714285714, "grad_norm": 0.4883800148963928, "learning_rate": 2.6248201780943222e-05, "loss": 0.0988, "num_input_tokens_seen": 22865920, "step": 22725 }, { "epoch": 10.716643092880716, "grad_norm": 0.1840713918209076, "learning_rate": 2.6237928499572766e-05, "loss": 0.0448, "num_input_tokens_seen": 22870016, "step": 22730 }, { "epoch": 10.719000471475718, "grad_norm": 0.11711424589157104, "learning_rate": 2.6227655008642036e-05, "loss": 0.1552, "num_input_tokens_seen": 22876800, "step": 22735 }, { "epoch": 10.72135785007072, "grad_norm": 1.5957624912261963, "learning_rate": 2.621738130989016e-05, "loss": 0.2183, "num_input_tokens_seen": 22881376, "step": 22740 }, { "epoch": 10.723715228665723, "grad_norm": 1.2857317924499512, "learning_rate": 2.6207107405056293e-05, "loss": 0.1229, "num_input_tokens_seen": 22886208, "step": 22745 }, { "epoch": 10.726072607260726, "grad_norm": 1.0814764499664307, "learning_rate": 2.6196833295879637e-05, "loss": 0.2207, "num_input_tokens_seen": 22891808, "step": 22750 }, { "epoch": 10.728429985855728, "grad_norm": 0.5150745511054993, "learning_rate": 2.618655898409943e-05, "loss": 0.1377, "num_input_tokens_seen": 22896544, "step": 22755 }, { "epoch": 10.73078736445073, "grad_norm": 0.2513306140899658, "learning_rate": 2.6176284471454932e-05, "loss": 0.0315, "num_input_tokens_seen": 22901504, "step": 22760 }, { "epoch": 10.733144743045733, "grad_norm": 0.35326483845710754, "learning_rate": 2.616600975968544e-05, "loss": 0.023, "num_input_tokens_seen": 22906432, "step": 22765 }, { "epoch": 10.735502121640735, "grad_norm": 3.5079002380371094, "learning_rate": 2.6155734850530296e-05, "loss": 0.1859, "num_input_tokens_seen": 22910656, "step": 22770 }, { "epoch": 10.737859500235738, "grad_norm": 1.930977463722229, "learning_rate": 2.6145459745728874e-05, "loss": 0.0804, "num_input_tokens_seen": 22914976, "step": 22775 }, { "epoch": 10.74021687883074, "grad_norm": 0.5160693526268005, "learning_rate": 2.6135184447020557e-05, "loss": 0.0638, "num_input_tokens_seen": 22920768, "step": 22780 }, { "epoch": 10.742574257425742, "grad_norm": 0.32246577739715576, "learning_rate": 2.6124908956144784e-05, "loss": 0.0836, "num_input_tokens_seen": 22926528, "step": 22785 }, { "epoch": 10.744931636020745, "grad_norm": 0.5713495016098022, "learning_rate": 2.611463327484103e-05, "loss": 0.0828, "num_input_tokens_seen": 22931040, "step": 22790 }, { "epoch": 10.747289014615747, "grad_norm": 1.1562026739120483, "learning_rate": 2.610435740484879e-05, "loss": 0.1132, "num_input_tokens_seen": 22936736, "step": 22795 }, { "epoch": 10.74964639321075, "grad_norm": 0.10229463875293732, "learning_rate": 2.6094081347907588e-05, "loss": 0.0292, "num_input_tokens_seen": 22941408, "step": 22800 }, { "epoch": 10.752003771805752, "grad_norm": 1.3527673482894897, "learning_rate": 2.6083805105756993e-05, "loss": 0.0742, "num_input_tokens_seen": 22946528, "step": 22805 }, { "epoch": 10.754361150400754, "grad_norm": 0.1010524183511734, "learning_rate": 2.6073528680136588e-05, "loss": 0.041, "num_input_tokens_seen": 22950880, "step": 22810 }, { "epoch": 10.756718528995757, "grad_norm": 0.5116051435470581, "learning_rate": 2.6063252072786016e-05, "loss": 0.1808, "num_input_tokens_seen": 22955232, "step": 22815 }, { "epoch": 10.75907590759076, "grad_norm": 0.19746799767017365, "learning_rate": 2.6052975285444914e-05, "loss": 0.2133, "num_input_tokens_seen": 22960384, "step": 22820 }, { "epoch": 10.761433286185762, "grad_norm": 0.1234419122338295, "learning_rate": 2.6042698319852975e-05, "loss": 0.033, "num_input_tokens_seen": 22964544, "step": 22825 }, { "epoch": 10.763790664780764, "grad_norm": 0.16728657484054565, "learning_rate": 2.6032421177749917e-05, "loss": 0.0368, "num_input_tokens_seen": 22969376, "step": 22830 }, { "epoch": 10.766148043375766, "grad_norm": 0.42290154099464417, "learning_rate": 2.6022143860875485e-05, "loss": 0.064, "num_input_tokens_seen": 22974784, "step": 22835 }, { "epoch": 10.768505421970769, "grad_norm": 0.09652838110923767, "learning_rate": 2.601186637096945e-05, "loss": 0.1169, "num_input_tokens_seen": 22979168, "step": 22840 }, { "epoch": 10.770862800565771, "grad_norm": 3.269984483718872, "learning_rate": 2.600158870977162e-05, "loss": 0.1356, "num_input_tokens_seen": 22984096, "step": 22845 }, { "epoch": 10.773220179160774, "grad_norm": 1.1474169492721558, "learning_rate": 2.599131087902183e-05, "loss": 0.0985, "num_input_tokens_seen": 22988832, "step": 22850 }, { "epoch": 10.775577557755776, "grad_norm": 0.4667089581489563, "learning_rate": 2.5981032880459932e-05, "loss": 0.2404, "num_input_tokens_seen": 22993952, "step": 22855 }, { "epoch": 10.777934936350778, "grad_norm": 0.6711676716804504, "learning_rate": 2.5970754715825836e-05, "loss": 0.0992, "num_input_tokens_seen": 22998688, "step": 22860 }, { "epoch": 10.78029231494578, "grad_norm": 0.7435174584388733, "learning_rate": 2.596047638685944e-05, "loss": 0.0624, "num_input_tokens_seen": 23003200, "step": 22865 }, { "epoch": 10.782649693540783, "grad_norm": 0.4764935076236725, "learning_rate": 2.59501978953007e-05, "loss": 0.1886, "num_input_tokens_seen": 23008288, "step": 22870 }, { "epoch": 10.785007072135786, "grad_norm": 1.1184312105178833, "learning_rate": 2.593991924288959e-05, "loss": 0.2403, "num_input_tokens_seen": 23012704, "step": 22875 }, { "epoch": 10.787364450730788, "grad_norm": 1.1680570840835571, "learning_rate": 2.5929640431366114e-05, "loss": 0.0671, "num_input_tokens_seen": 23018208, "step": 22880 }, { "epoch": 10.78972182932579, "grad_norm": 0.04353463649749756, "learning_rate": 2.5919361462470294e-05, "loss": 0.0873, "num_input_tokens_seen": 23022880, "step": 22885 }, { "epoch": 10.792079207920793, "grad_norm": 1.1143708229064941, "learning_rate": 2.5909082337942185e-05, "loss": 0.0717, "num_input_tokens_seen": 23028000, "step": 22890 }, { "epoch": 10.794436586515795, "grad_norm": 0.11841019243001938, "learning_rate": 2.5898803059521874e-05, "loss": 0.0694, "num_input_tokens_seen": 23033216, "step": 22895 }, { "epoch": 10.796793965110798, "grad_norm": 1.4349403381347656, "learning_rate": 2.5888523628949472e-05, "loss": 0.1306, "num_input_tokens_seen": 23038240, "step": 22900 }, { "epoch": 10.799151343705798, "grad_norm": 0.10657990723848343, "learning_rate": 2.58782440479651e-05, "loss": 0.1024, "num_input_tokens_seen": 23042848, "step": 22905 }, { "epoch": 10.801508722300802, "grad_norm": 0.30667927861213684, "learning_rate": 2.5867964318308922e-05, "loss": 0.1615, "num_input_tokens_seen": 23047136, "step": 22910 }, { "epoch": 10.803866100895803, "grad_norm": 1.383366346359253, "learning_rate": 2.5857684441721124e-05, "loss": 0.1165, "num_input_tokens_seen": 23051968, "step": 22915 }, { "epoch": 10.806223479490805, "grad_norm": 1.4252030849456787, "learning_rate": 2.5847404419941917e-05, "loss": 0.1349, "num_input_tokens_seen": 23056992, "step": 22920 }, { "epoch": 10.808580858085808, "grad_norm": 1.6090117692947388, "learning_rate": 2.5837124254711524e-05, "loss": 0.1763, "num_input_tokens_seen": 23063072, "step": 22925 }, { "epoch": 10.81093823668081, "grad_norm": 0.12097008526325226, "learning_rate": 2.5826843947770212e-05, "loss": 0.2828, "num_input_tokens_seen": 23070944, "step": 22930 }, { "epoch": 10.813295615275813, "grad_norm": 0.04828781262040138, "learning_rate": 2.5816563500858254e-05, "loss": 0.065, "num_input_tokens_seen": 23074912, "step": 22935 }, { "epoch": 10.815652993870815, "grad_norm": 0.15956975519657135, "learning_rate": 2.5806282915715967e-05, "loss": 0.0845, "num_input_tokens_seen": 23080512, "step": 22940 }, { "epoch": 10.818010372465817, "grad_norm": 0.20727220177650452, "learning_rate": 2.579600219408367e-05, "loss": 0.1842, "num_input_tokens_seen": 23085792, "step": 22945 }, { "epoch": 10.82036775106082, "grad_norm": 0.37210506200790405, "learning_rate": 2.5785721337701706e-05, "loss": 0.0399, "num_input_tokens_seen": 23090784, "step": 22950 }, { "epoch": 10.822725129655822, "grad_norm": 0.08588477969169617, "learning_rate": 2.577544034831047e-05, "loss": 0.2223, "num_input_tokens_seen": 23094880, "step": 22955 }, { "epoch": 10.825082508250825, "grad_norm": 1.2856764793395996, "learning_rate": 2.5765159227650353e-05, "loss": 0.1388, "num_input_tokens_seen": 23099136, "step": 22960 }, { "epoch": 10.827439886845827, "grad_norm": 1.2876471281051636, "learning_rate": 2.5754877977461767e-05, "loss": 0.1606, "num_input_tokens_seen": 23103904, "step": 22965 }, { "epoch": 10.82979726544083, "grad_norm": 0.14250043034553528, "learning_rate": 2.5744596599485156e-05, "loss": 0.0912, "num_input_tokens_seen": 23108064, "step": 22970 }, { "epoch": 10.832154644035832, "grad_norm": 1.4466438293457031, "learning_rate": 2.5734315095460982e-05, "loss": 0.0493, "num_input_tokens_seen": 23112704, "step": 22975 }, { "epoch": 10.834512022630834, "grad_norm": 0.686607301235199, "learning_rate": 2.5724033467129726e-05, "loss": 0.153, "num_input_tokens_seen": 23118080, "step": 22980 }, { "epoch": 10.836869401225837, "grad_norm": 1.7723731994628906, "learning_rate": 2.571375171623191e-05, "loss": 0.1273, "num_input_tokens_seen": 23123072, "step": 22985 }, { "epoch": 10.839226779820839, "grad_norm": 1.7506321668624878, "learning_rate": 2.5703469844508038e-05, "loss": 0.0993, "num_input_tokens_seen": 23128768, "step": 22990 }, { "epoch": 10.841584158415841, "grad_norm": 0.4792102575302124, "learning_rate": 2.5693187853698664e-05, "loss": 0.2889, "num_input_tokens_seen": 23134400, "step": 22995 }, { "epoch": 10.843941537010844, "grad_norm": 0.1624113917350769, "learning_rate": 2.5682905745544354e-05, "loss": 0.2165, "num_input_tokens_seen": 23139232, "step": 23000 }, { "epoch": 10.846298915605846, "grad_norm": 0.07913282513618469, "learning_rate": 2.567262352178571e-05, "loss": 0.1153, "num_input_tokens_seen": 23144032, "step": 23005 }, { "epoch": 10.848656294200849, "grad_norm": 0.11785006523132324, "learning_rate": 2.566234118416331e-05, "loss": 0.0674, "num_input_tokens_seen": 23150080, "step": 23010 }, { "epoch": 10.851013672795851, "grad_norm": 1.539531946182251, "learning_rate": 2.56520587344178e-05, "loss": 0.3021, "num_input_tokens_seen": 23155328, "step": 23015 }, { "epoch": 10.853371051390853, "grad_norm": 1.553033709526062, "learning_rate": 2.5641776174289816e-05, "loss": 0.1907, "num_input_tokens_seen": 23160544, "step": 23020 }, { "epoch": 10.855728429985856, "grad_norm": 1.3880305290222168, "learning_rate": 2.5631493505520028e-05, "loss": 0.0661, "num_input_tokens_seen": 23164960, "step": 23025 }, { "epoch": 10.858085808580858, "grad_norm": 0.10889008641242981, "learning_rate": 2.5621210729849105e-05, "loss": 0.159, "num_input_tokens_seen": 23170016, "step": 23030 }, { "epoch": 10.86044318717586, "grad_norm": 0.03506096825003624, "learning_rate": 2.561092784901775e-05, "loss": 0.0577, "num_input_tokens_seen": 23174912, "step": 23035 }, { "epoch": 10.862800565770863, "grad_norm": 1.15422523021698, "learning_rate": 2.5600644864766687e-05, "loss": 0.2291, "num_input_tokens_seen": 23179232, "step": 23040 }, { "epoch": 10.865157944365865, "grad_norm": 0.07444324344396591, "learning_rate": 2.5590361778836652e-05, "loss": 0.0623, "num_input_tokens_seen": 23184960, "step": 23045 }, { "epoch": 10.867515322960868, "grad_norm": 0.7529807686805725, "learning_rate": 2.558007859296839e-05, "loss": 0.101, "num_input_tokens_seen": 23189312, "step": 23050 }, { "epoch": 10.86987270155587, "grad_norm": 0.1466115564107895, "learning_rate": 2.556979530890266e-05, "loss": 0.0476, "num_input_tokens_seen": 23193824, "step": 23055 }, { "epoch": 10.872230080150873, "grad_norm": 0.8032283782958984, "learning_rate": 2.555951192838027e-05, "loss": 0.2348, "num_input_tokens_seen": 23198464, "step": 23060 }, { "epoch": 10.874587458745875, "grad_norm": 0.19094626605510712, "learning_rate": 2.554922845314201e-05, "loss": 0.0999, "num_input_tokens_seen": 23204416, "step": 23065 }, { "epoch": 10.876944837340877, "grad_norm": 0.06416788697242737, "learning_rate": 2.5538944884928694e-05, "loss": 0.1312, "num_input_tokens_seen": 23210400, "step": 23070 }, { "epoch": 10.87930221593588, "grad_norm": 0.036053530871868134, "learning_rate": 2.5528661225481165e-05, "loss": 0.0371, "num_input_tokens_seen": 23215840, "step": 23075 }, { "epoch": 10.881659594530882, "grad_norm": 0.08092567324638367, "learning_rate": 2.551837747654027e-05, "loss": 0.2309, "num_input_tokens_seen": 23221632, "step": 23080 }, { "epoch": 10.884016973125885, "grad_norm": 0.1148085966706276, "learning_rate": 2.5508093639846857e-05, "loss": 0.0874, "num_input_tokens_seen": 23226176, "step": 23085 }, { "epoch": 10.886374351720887, "grad_norm": 0.2841644287109375, "learning_rate": 2.549780971714183e-05, "loss": 0.0806, "num_input_tokens_seen": 23231040, "step": 23090 }, { "epoch": 10.88873173031589, "grad_norm": 1.2975786924362183, "learning_rate": 2.548752571016606e-05, "loss": 0.1059, "num_input_tokens_seen": 23236192, "step": 23095 }, { "epoch": 10.891089108910892, "grad_norm": 0.7008731961250305, "learning_rate": 2.5477241620660468e-05, "loss": 0.0889, "num_input_tokens_seen": 23240640, "step": 23100 }, { "epoch": 10.893446487505894, "grad_norm": 1.1072858572006226, "learning_rate": 2.5466957450365976e-05, "loss": 0.1297, "num_input_tokens_seen": 23246752, "step": 23105 }, { "epoch": 10.895803866100895, "grad_norm": 0.42553824186325073, "learning_rate": 2.5456673201023513e-05, "loss": 0.0739, "num_input_tokens_seen": 23251904, "step": 23110 }, { "epoch": 10.898161244695899, "grad_norm": 0.12202386558055878, "learning_rate": 2.5446388874374034e-05, "loss": 0.1292, "num_input_tokens_seen": 23256128, "step": 23115 }, { "epoch": 10.9005186232909, "grad_norm": 0.3089841306209564, "learning_rate": 2.543610447215849e-05, "loss": 0.1867, "num_input_tokens_seen": 23260864, "step": 23120 }, { "epoch": 10.902876001885902, "grad_norm": 2.055222749710083, "learning_rate": 2.5425819996117873e-05, "loss": 0.1968, "num_input_tokens_seen": 23266176, "step": 23125 }, { "epoch": 10.905233380480905, "grad_norm": 0.5715563297271729, "learning_rate": 2.541553544799316e-05, "loss": 0.1471, "num_input_tokens_seen": 23271072, "step": 23130 }, { "epoch": 10.907590759075907, "grad_norm": 0.5174059271812439, "learning_rate": 2.5405250829525344e-05, "loss": 0.0426, "num_input_tokens_seen": 23276448, "step": 23135 }, { "epoch": 10.90994813767091, "grad_norm": 1.3279613256454468, "learning_rate": 2.5394966142455445e-05, "loss": 0.0732, "num_input_tokens_seen": 23281536, "step": 23140 }, { "epoch": 10.912305516265912, "grad_norm": 0.5949222445487976, "learning_rate": 2.5384681388524488e-05, "loss": 0.0725, "num_input_tokens_seen": 23289408, "step": 23145 }, { "epoch": 10.914662894860914, "grad_norm": 0.5261401534080505, "learning_rate": 2.5374396569473506e-05, "loss": 0.0909, "num_input_tokens_seen": 23294112, "step": 23150 }, { "epoch": 10.917020273455917, "grad_norm": 0.29083457589149475, "learning_rate": 2.5364111687043535e-05, "loss": 0.1161, "num_input_tokens_seen": 23298944, "step": 23155 }, { "epoch": 10.919377652050919, "grad_norm": 0.6048810482025146, "learning_rate": 2.535382674297564e-05, "loss": 0.0351, "num_input_tokens_seen": 23303616, "step": 23160 }, { "epoch": 10.921735030645921, "grad_norm": 1.9534611701965332, "learning_rate": 2.5343541739010884e-05, "loss": 0.0956, "num_input_tokens_seen": 23308512, "step": 23165 }, { "epoch": 10.924092409240924, "grad_norm": 0.048118188977241516, "learning_rate": 2.533325667689035e-05, "loss": 0.0582, "num_input_tokens_seen": 23312832, "step": 23170 }, { "epoch": 10.926449787835926, "grad_norm": 0.6705607771873474, "learning_rate": 2.5322971558355112e-05, "loss": 0.0558, "num_input_tokens_seen": 23317888, "step": 23175 }, { "epoch": 10.928807166430929, "grad_norm": 1.9452435970306396, "learning_rate": 2.531268638514627e-05, "loss": 0.1752, "num_input_tokens_seen": 23322432, "step": 23180 }, { "epoch": 10.931164545025931, "grad_norm": 0.236991286277771, "learning_rate": 2.530240115900494e-05, "loss": 0.1246, "num_input_tokens_seen": 23326784, "step": 23185 }, { "epoch": 10.933521923620933, "grad_norm": 1.1520782709121704, "learning_rate": 2.529211588167223e-05, "loss": 0.0452, "num_input_tokens_seen": 23332928, "step": 23190 }, { "epoch": 10.935879302215936, "grad_norm": 1.5205485820770264, "learning_rate": 2.5281830554889262e-05, "loss": 0.0901, "num_input_tokens_seen": 23337824, "step": 23195 }, { "epoch": 10.938236680810938, "grad_norm": 0.15030290186405182, "learning_rate": 2.527154518039716e-05, "loss": 0.1581, "num_input_tokens_seen": 23343136, "step": 23200 }, { "epoch": 10.94059405940594, "grad_norm": 0.09002699702978134, "learning_rate": 2.5261259759937072e-05, "loss": 0.0647, "num_input_tokens_seen": 23349056, "step": 23205 }, { "epoch": 10.942951438000943, "grad_norm": 0.3100588917732239, "learning_rate": 2.525097429525014e-05, "loss": 0.1095, "num_input_tokens_seen": 23354368, "step": 23210 }, { "epoch": 10.945308816595945, "grad_norm": 0.78585284948349, "learning_rate": 2.5240688788077527e-05, "loss": 0.1518, "num_input_tokens_seen": 23359552, "step": 23215 }, { "epoch": 10.947666195190948, "grad_norm": 1.2842737436294556, "learning_rate": 2.5230403240160383e-05, "loss": 0.0584, "num_input_tokens_seen": 23364608, "step": 23220 }, { "epoch": 10.95002357378595, "grad_norm": 0.11174644529819489, "learning_rate": 2.5220117653239882e-05, "loss": 0.0813, "num_input_tokens_seen": 23368992, "step": 23225 }, { "epoch": 10.952380952380953, "grad_norm": 0.8359617590904236, "learning_rate": 2.5209832029057202e-05, "loss": 0.0643, "num_input_tokens_seen": 23374368, "step": 23230 }, { "epoch": 10.954738330975955, "grad_norm": 0.09720829874277115, "learning_rate": 2.5199546369353522e-05, "loss": 0.0944, "num_input_tokens_seen": 23379392, "step": 23235 }, { "epoch": 10.957095709570957, "grad_norm": 0.030577844008803368, "learning_rate": 2.518926067587003e-05, "loss": 0.1685, "num_input_tokens_seen": 23383232, "step": 23240 }, { "epoch": 10.95945308816596, "grad_norm": 0.03838295862078667, "learning_rate": 2.5178974950347917e-05, "loss": 0.1221, "num_input_tokens_seen": 23388384, "step": 23245 }, { "epoch": 10.961810466760962, "grad_norm": 1.4528483152389526, "learning_rate": 2.5168689194528382e-05, "loss": 0.1313, "num_input_tokens_seen": 23393312, "step": 23250 }, { "epoch": 10.964167845355965, "grad_norm": 0.07362936437129974, "learning_rate": 2.5158403410152643e-05, "loss": 0.0635, "num_input_tokens_seen": 23398880, "step": 23255 }, { "epoch": 10.966525223950967, "grad_norm": 0.26338598132133484, "learning_rate": 2.5148117598961883e-05, "loss": 0.105, "num_input_tokens_seen": 23403136, "step": 23260 }, { "epoch": 10.96888260254597, "grad_norm": 0.5852299332618713, "learning_rate": 2.5137831762697333e-05, "loss": 0.1783, "num_input_tokens_seen": 23408288, "step": 23265 }, { "epoch": 10.971239981140972, "grad_norm": 0.554311215877533, "learning_rate": 2.5127545903100202e-05, "loss": 0.0729, "num_input_tokens_seen": 23412608, "step": 23270 }, { "epoch": 10.973597359735974, "grad_norm": 0.8098270893096924, "learning_rate": 2.5117260021911727e-05, "loss": 0.2157, "num_input_tokens_seen": 23416704, "step": 23275 }, { "epoch": 10.975954738330977, "grad_norm": 0.6551787257194519, "learning_rate": 2.510697412087311e-05, "loss": 0.0446, "num_input_tokens_seen": 23421216, "step": 23280 }, { "epoch": 10.978312116925979, "grad_norm": 0.8848264217376709, "learning_rate": 2.5096688201725593e-05, "loss": 0.0446, "num_input_tokens_seen": 23426304, "step": 23285 }, { "epoch": 10.980669495520981, "grad_norm": 0.1217896044254303, "learning_rate": 2.50864022662104e-05, "loss": 0.1145, "num_input_tokens_seen": 23431424, "step": 23290 }, { "epoch": 10.983026874115984, "grad_norm": 0.22713449597358704, "learning_rate": 2.507611631606878e-05, "loss": 0.0745, "num_input_tokens_seen": 23436992, "step": 23295 }, { "epoch": 10.985384252710986, "grad_norm": 1.4317818880081177, "learning_rate": 2.5065830353041962e-05, "loss": 0.2269, "num_input_tokens_seen": 23441408, "step": 23300 }, { "epoch": 10.987741631305987, "grad_norm": 1.0574181079864502, "learning_rate": 2.5055544378871178e-05, "loss": 0.0821, "num_input_tokens_seen": 23446720, "step": 23305 }, { "epoch": 10.990099009900991, "grad_norm": 0.05632971227169037, "learning_rate": 2.504525839529767e-05, "loss": 0.1738, "num_input_tokens_seen": 23450976, "step": 23310 }, { "epoch": 10.992456388495992, "grad_norm": 0.3492273986339569, "learning_rate": 2.503497240406269e-05, "loss": 0.1434, "num_input_tokens_seen": 23457440, "step": 23315 }, { "epoch": 10.994813767090994, "grad_norm": 2.33290696144104, "learning_rate": 2.502468640690747e-05, "loss": 0.0916, "num_input_tokens_seen": 23462112, "step": 23320 }, { "epoch": 10.997171145685996, "grad_norm": 0.27963951230049133, "learning_rate": 2.5014400405573262e-05, "loss": 0.0439, "num_input_tokens_seen": 23466912, "step": 23325 }, { "epoch": 10.999528524280999, "grad_norm": 0.10782792419195175, "learning_rate": 2.500411440180131e-05, "loss": 0.14, "num_input_tokens_seen": 23471712, "step": 23330 }, { "epoch": 11.0, "eval_loss": 0.15143364667892456, "eval_runtime": 15.1635, "eval_samples_per_second": 62.189, "eval_steps_per_second": 15.564, "num_input_tokens_seen": 23472352, "step": 23331 }, { "epoch": 11.001885902876001, "grad_norm": 1.2175828218460083, "learning_rate": 2.499382839733286e-05, "loss": 0.1103, "num_input_tokens_seen": 23476192, "step": 23335 }, { "epoch": 11.004243281471004, "grad_norm": 2.04282808303833, "learning_rate": 2.4983542393909156e-05, "loss": 0.1692, "num_input_tokens_seen": 23481728, "step": 23340 }, { "epoch": 11.006600660066006, "grad_norm": 0.2739919424057007, "learning_rate": 2.4973256393271448e-05, "loss": 0.0745, "num_input_tokens_seen": 23487552, "step": 23345 }, { "epoch": 11.008958038661008, "grad_norm": 0.020164526998996735, "learning_rate": 2.4962970397160975e-05, "loss": 0.1607, "num_input_tokens_seen": 23491744, "step": 23350 }, { "epoch": 11.01131541725601, "grad_norm": 1.3893903493881226, "learning_rate": 2.4952684407319e-05, "loss": 0.1732, "num_input_tokens_seen": 23497888, "step": 23355 }, { "epoch": 11.013672795851013, "grad_norm": 0.9086114168167114, "learning_rate": 2.4942398425486745e-05, "loss": 0.2163, "num_input_tokens_seen": 23504512, "step": 23360 }, { "epoch": 11.016030174446016, "grad_norm": 0.2900054156780243, "learning_rate": 2.493211245340545e-05, "loss": 0.1341, "num_input_tokens_seen": 23508704, "step": 23365 }, { "epoch": 11.018387553041018, "grad_norm": 0.08809980005025864, "learning_rate": 2.4921826492816366e-05, "loss": 0.0744, "num_input_tokens_seen": 23513632, "step": 23370 }, { "epoch": 11.02074493163602, "grad_norm": 0.15360960364341736, "learning_rate": 2.491154054546073e-05, "loss": 0.1195, "num_input_tokens_seen": 23518560, "step": 23375 }, { "epoch": 11.023102310231023, "grad_norm": 0.6053917407989502, "learning_rate": 2.490125461307978e-05, "loss": 0.0397, "num_input_tokens_seen": 23523520, "step": 23380 }, { "epoch": 11.025459688826025, "grad_norm": 1.1459609270095825, "learning_rate": 2.4890968697414747e-05, "loss": 0.0441, "num_input_tokens_seen": 23528192, "step": 23385 }, { "epoch": 11.027817067421028, "grad_norm": 0.7193508148193359, "learning_rate": 2.488068280020686e-05, "loss": 0.1262, "num_input_tokens_seen": 23533856, "step": 23390 }, { "epoch": 11.03017444601603, "grad_norm": 1.2301656007766724, "learning_rate": 2.4870396923197348e-05, "loss": 0.1675, "num_input_tokens_seen": 23538016, "step": 23395 }, { "epoch": 11.032531824611032, "grad_norm": 0.6331346035003662, "learning_rate": 2.486011106812744e-05, "loss": 0.0304, "num_input_tokens_seen": 23543776, "step": 23400 }, { "epoch": 11.034889203206035, "grad_norm": 0.7569564580917358, "learning_rate": 2.484982523673835e-05, "loss": 0.1218, "num_input_tokens_seen": 23548256, "step": 23405 }, { "epoch": 11.037246581801037, "grad_norm": 1.2319786548614502, "learning_rate": 2.4839539430771296e-05, "loss": 0.2219, "num_input_tokens_seen": 23553472, "step": 23410 }, { "epoch": 11.03960396039604, "grad_norm": 0.02718391641974449, "learning_rate": 2.4829253651967495e-05, "loss": 0.0901, "num_input_tokens_seen": 23557856, "step": 23415 }, { "epoch": 11.041961338991042, "grad_norm": 0.088328517973423, "learning_rate": 2.4818967902068158e-05, "loss": 0.1353, "num_input_tokens_seen": 23563232, "step": 23420 }, { "epoch": 11.044318717586044, "grad_norm": 1.1991512775421143, "learning_rate": 2.4808682182814473e-05, "loss": 0.1445, "num_input_tokens_seen": 23568768, "step": 23425 }, { "epoch": 11.046676096181047, "grad_norm": 0.09473709017038345, "learning_rate": 2.4798396495947645e-05, "loss": 0.0605, "num_input_tokens_seen": 23573024, "step": 23430 }, { "epoch": 11.04903347477605, "grad_norm": 2.113351345062256, "learning_rate": 2.478811084320887e-05, "loss": 0.0974, "num_input_tokens_seen": 23577568, "step": 23435 }, { "epoch": 11.051390853371052, "grad_norm": 0.33716902136802673, "learning_rate": 2.4777825226339337e-05, "loss": 0.1642, "num_input_tokens_seen": 23582528, "step": 23440 }, { "epoch": 11.053748231966054, "grad_norm": 1.0658470392227173, "learning_rate": 2.476753964708021e-05, "loss": 0.1626, "num_input_tokens_seen": 23587072, "step": 23445 }, { "epoch": 11.056105610561056, "grad_norm": 1.7771942615509033, "learning_rate": 2.4757254107172682e-05, "loss": 0.16, "num_input_tokens_seen": 23592320, "step": 23450 }, { "epoch": 11.058462989156059, "grad_norm": 0.6050965189933777, "learning_rate": 2.4746968608357905e-05, "loss": 0.1165, "num_input_tokens_seen": 23597088, "step": 23455 }, { "epoch": 11.060820367751061, "grad_norm": 0.2574275732040405, "learning_rate": 2.4736683152377056e-05, "loss": 0.0492, "num_input_tokens_seen": 23602944, "step": 23460 }, { "epoch": 11.063177746346064, "grad_norm": 2.689032793045044, "learning_rate": 2.4726397740971272e-05, "loss": 0.0684, "num_input_tokens_seen": 23607424, "step": 23465 }, { "epoch": 11.065535124941066, "grad_norm": 1.1253652572631836, "learning_rate": 2.4716112375881704e-05, "loss": 0.062, "num_input_tokens_seen": 23612448, "step": 23470 }, { "epoch": 11.067892503536068, "grad_norm": 0.11018339544534683, "learning_rate": 2.470582705884949e-05, "loss": 0.1008, "num_input_tokens_seen": 23617888, "step": 23475 }, { "epoch": 11.07024988213107, "grad_norm": 0.6976273059844971, "learning_rate": 2.469554179161577e-05, "loss": 0.0613, "num_input_tokens_seen": 23623744, "step": 23480 }, { "epoch": 11.072607260726073, "grad_norm": 0.6442989706993103, "learning_rate": 2.4685256575921643e-05, "loss": 0.075, "num_input_tokens_seen": 23628256, "step": 23485 }, { "epoch": 11.074964639321076, "grad_norm": 0.4840449392795563, "learning_rate": 2.467497141350824e-05, "loss": 0.0728, "num_input_tokens_seen": 23633024, "step": 23490 }, { "epoch": 11.077322017916078, "grad_norm": 0.17388522624969482, "learning_rate": 2.4664686306116656e-05, "loss": 0.1193, "num_input_tokens_seen": 23637792, "step": 23495 }, { "epoch": 11.07967939651108, "grad_norm": 0.2094157487154007, "learning_rate": 2.465440125548799e-05, "loss": 0.123, "num_input_tokens_seen": 23643232, "step": 23500 }, { "epoch": 11.082036775106083, "grad_norm": 0.6161404252052307, "learning_rate": 2.4644116263363323e-05, "loss": 0.0784, "num_input_tokens_seen": 23647360, "step": 23505 }, { "epoch": 11.084394153701085, "grad_norm": 1.4007400274276733, "learning_rate": 2.4633831331483724e-05, "loss": 0.2101, "num_input_tokens_seen": 23652192, "step": 23510 }, { "epoch": 11.086751532296086, "grad_norm": 1.0344586372375488, "learning_rate": 2.4623546461590267e-05, "loss": 0.2448, "num_input_tokens_seen": 23657792, "step": 23515 }, { "epoch": 11.089108910891088, "grad_norm": 0.2312472015619278, "learning_rate": 2.461326165542401e-05, "loss": 0.0485, "num_input_tokens_seen": 23663328, "step": 23520 }, { "epoch": 11.09146628948609, "grad_norm": 0.24424545466899872, "learning_rate": 2.460297691472598e-05, "loss": 0.0344, "num_input_tokens_seen": 23667616, "step": 23525 }, { "epoch": 11.093823668081093, "grad_norm": 1.2736883163452148, "learning_rate": 2.4592692241237217e-05, "loss": 0.0406, "num_input_tokens_seen": 23672576, "step": 23530 }, { "epoch": 11.096181046676096, "grad_norm": 0.11464616656303406, "learning_rate": 2.458240763669874e-05, "loss": 0.1864, "num_input_tokens_seen": 23677280, "step": 23535 }, { "epoch": 11.098538425271098, "grad_norm": 0.02711058035492897, "learning_rate": 2.457212310285157e-05, "loss": 0.1288, "num_input_tokens_seen": 23681728, "step": 23540 }, { "epoch": 11.1008958038661, "grad_norm": 0.03394826129078865, "learning_rate": 2.4561838641436685e-05, "loss": 0.0464, "num_input_tokens_seen": 23687584, "step": 23545 }, { "epoch": 11.103253182461103, "grad_norm": 0.5054582953453064, "learning_rate": 2.455155425419508e-05, "loss": 0.0261, "num_input_tokens_seen": 23692480, "step": 23550 }, { "epoch": 11.105610561056105, "grad_norm": 0.7036149501800537, "learning_rate": 2.4541269942867722e-05, "loss": 0.0756, "num_input_tokens_seen": 23699040, "step": 23555 }, { "epoch": 11.107967939651108, "grad_norm": 2.545407295227051, "learning_rate": 2.4530985709195582e-05, "loss": 0.2345, "num_input_tokens_seen": 23705408, "step": 23560 }, { "epoch": 11.11032531824611, "grad_norm": 0.3790806233882904, "learning_rate": 2.4520701554919595e-05, "loss": 0.0407, "num_input_tokens_seen": 23710688, "step": 23565 }, { "epoch": 11.112682696841112, "grad_norm": 0.2798534631729126, "learning_rate": 2.4510417481780697e-05, "loss": 0.2394, "num_input_tokens_seen": 23715648, "step": 23570 }, { "epoch": 11.115040075436115, "grad_norm": 1.8083866834640503, "learning_rate": 2.4500133491519802e-05, "loss": 0.1251, "num_input_tokens_seen": 23720064, "step": 23575 }, { "epoch": 11.117397454031117, "grad_norm": 1.359642744064331, "learning_rate": 2.4489849585877834e-05, "loss": 0.2226, "num_input_tokens_seen": 23725280, "step": 23580 }, { "epoch": 11.11975483262612, "grad_norm": 1.6063779592514038, "learning_rate": 2.4479565766595656e-05, "loss": 0.2203, "num_input_tokens_seen": 23729408, "step": 23585 }, { "epoch": 11.122112211221122, "grad_norm": 0.36020204424858093, "learning_rate": 2.4469282035414163e-05, "loss": 0.124, "num_input_tokens_seen": 23734368, "step": 23590 }, { "epoch": 11.124469589816124, "grad_norm": 1.6061781644821167, "learning_rate": 2.4458998394074207e-05, "loss": 0.2403, "num_input_tokens_seen": 23738656, "step": 23595 }, { "epoch": 11.126826968411127, "grad_norm": 0.7048895955085754, "learning_rate": 2.4448714844316635e-05, "loss": 0.1577, "num_input_tokens_seen": 23744192, "step": 23600 }, { "epoch": 11.12918434700613, "grad_norm": 0.41572120785713196, "learning_rate": 2.443843138788228e-05, "loss": 0.0662, "num_input_tokens_seen": 23749760, "step": 23605 }, { "epoch": 11.131541725601132, "grad_norm": 1.2152204513549805, "learning_rate": 2.4428148026511964e-05, "loss": 0.0864, "num_input_tokens_seen": 23754592, "step": 23610 }, { "epoch": 11.133899104196134, "grad_norm": 2.0152647495269775, "learning_rate": 2.441786476194647e-05, "loss": 0.1282, "num_input_tokens_seen": 23759104, "step": 23615 }, { "epoch": 11.136256482791136, "grad_norm": 0.022905338555574417, "learning_rate": 2.4407581595926586e-05, "loss": 0.0395, "num_input_tokens_seen": 23764064, "step": 23620 }, { "epoch": 11.138613861386139, "grad_norm": 1.7847949266433716, "learning_rate": 2.439729853019308e-05, "loss": 0.0928, "num_input_tokens_seen": 23768992, "step": 23625 }, { "epoch": 11.140971239981141, "grad_norm": 0.07846009731292725, "learning_rate": 2.4387015566486707e-05, "loss": 0.0789, "num_input_tokens_seen": 23773984, "step": 23630 }, { "epoch": 11.143328618576144, "grad_norm": 0.9552448987960815, "learning_rate": 2.4376732706548183e-05, "loss": 0.0746, "num_input_tokens_seen": 23778848, "step": 23635 }, { "epoch": 11.145685997171146, "grad_norm": 0.07876516133546829, "learning_rate": 2.436644995211823e-05, "loss": 0.1991, "num_input_tokens_seen": 23785440, "step": 23640 }, { "epoch": 11.148043375766148, "grad_norm": 0.8425602912902832, "learning_rate": 2.4356167304937545e-05, "loss": 0.1213, "num_input_tokens_seen": 23789696, "step": 23645 }, { "epoch": 11.15040075436115, "grad_norm": 0.10777195543050766, "learning_rate": 2.4345884766746807e-05, "loss": 0.0507, "num_input_tokens_seen": 23793888, "step": 23650 }, { "epoch": 11.152758132956153, "grad_norm": 1.9650683403015137, "learning_rate": 2.4335602339286667e-05, "loss": 0.0928, "num_input_tokens_seen": 23799328, "step": 23655 }, { "epoch": 11.155115511551156, "grad_norm": 2.485038995742798, "learning_rate": 2.432532002429777e-05, "loss": 0.2056, "num_input_tokens_seen": 23805152, "step": 23660 }, { "epoch": 11.157472890146158, "grad_norm": 1.064167857170105, "learning_rate": 2.431503782352074e-05, "loss": 0.0917, "num_input_tokens_seen": 23809568, "step": 23665 }, { "epoch": 11.15983026874116, "grad_norm": 0.449677973985672, "learning_rate": 2.4304755738696178e-05, "loss": 0.1245, "num_input_tokens_seen": 23814016, "step": 23670 }, { "epoch": 11.162187647336163, "grad_norm": 1.6192253828048706, "learning_rate": 2.4294473771564662e-05, "loss": 0.2122, "num_input_tokens_seen": 23818368, "step": 23675 }, { "epoch": 11.164545025931165, "grad_norm": 0.9854294061660767, "learning_rate": 2.4284191923866756e-05, "loss": 0.0663, "num_input_tokens_seen": 23823424, "step": 23680 }, { "epoch": 11.166902404526168, "grad_norm": 0.5283966660499573, "learning_rate": 2.4273910197343002e-05, "loss": 0.0457, "num_input_tokens_seen": 23827936, "step": 23685 }, { "epoch": 11.16925978312117, "grad_norm": 0.3463340997695923, "learning_rate": 2.426362859373393e-05, "loss": 0.2383, "num_input_tokens_seen": 23832736, "step": 23690 }, { "epoch": 11.171617161716172, "grad_norm": 0.32223066687583923, "learning_rate": 2.425334711478003e-05, "loss": 0.1229, "num_input_tokens_seen": 23837920, "step": 23695 }, { "epoch": 11.173974540311175, "grad_norm": 1.1887969970703125, "learning_rate": 2.4243065762221787e-05, "loss": 0.1197, "num_input_tokens_seen": 23842560, "step": 23700 }, { "epoch": 11.176331918906177, "grad_norm": 0.3156804144382477, "learning_rate": 2.4232784537799655e-05, "loss": 0.035, "num_input_tokens_seen": 23847200, "step": 23705 }, { "epoch": 11.17868929750118, "grad_norm": 1.1877974271774292, "learning_rate": 2.422250344325408e-05, "loss": 0.0793, "num_input_tokens_seen": 23851456, "step": 23710 }, { "epoch": 11.18104667609618, "grad_norm": 0.12059023976325989, "learning_rate": 2.421222248032547e-05, "loss": 0.1634, "num_input_tokens_seen": 23857056, "step": 23715 }, { "epoch": 11.183404054691183, "grad_norm": 0.12835286557674408, "learning_rate": 2.4201941650754215e-05, "loss": 0.0651, "num_input_tokens_seen": 23863456, "step": 23720 }, { "epoch": 11.185761433286185, "grad_norm": 0.32904139161109924, "learning_rate": 2.4191660956280687e-05, "loss": 0.0419, "num_input_tokens_seen": 23867392, "step": 23725 }, { "epoch": 11.188118811881187, "grad_norm": 0.4672934412956238, "learning_rate": 2.418138039864524e-05, "loss": 0.0253, "num_input_tokens_seen": 23872128, "step": 23730 }, { "epoch": 11.19047619047619, "grad_norm": 0.6861374974250793, "learning_rate": 2.417109997958819e-05, "loss": 0.0284, "num_input_tokens_seen": 23876448, "step": 23735 }, { "epoch": 11.192833569071192, "grad_norm": 2.43845534324646, "learning_rate": 2.4160819700849836e-05, "loss": 0.3254, "num_input_tokens_seen": 23881536, "step": 23740 }, { "epoch": 11.195190947666195, "grad_norm": 1.5259881019592285, "learning_rate": 2.415053956417046e-05, "loss": 0.2391, "num_input_tokens_seen": 23886816, "step": 23745 }, { "epoch": 11.197548326261197, "grad_norm": 0.01941388100385666, "learning_rate": 2.414025957129032e-05, "loss": 0.2549, "num_input_tokens_seen": 23891360, "step": 23750 }, { "epoch": 11.1999057048562, "grad_norm": 0.4715493321418762, "learning_rate": 2.4129979723949628e-05, "loss": 0.0553, "num_input_tokens_seen": 23896864, "step": 23755 }, { "epoch": 11.202263083451202, "grad_norm": 2.1050243377685547, "learning_rate": 2.4119700023888598e-05, "loss": 0.2131, "num_input_tokens_seen": 23903616, "step": 23760 }, { "epoch": 11.204620462046204, "grad_norm": 1.4223973751068115, "learning_rate": 2.4109420472847406e-05, "loss": 0.2682, "num_input_tokens_seen": 23908064, "step": 23765 }, { "epoch": 11.206977840641207, "grad_norm": 0.654634416103363, "learning_rate": 2.4099141072566214e-05, "loss": 0.0607, "num_input_tokens_seen": 23913472, "step": 23770 }, { "epoch": 11.209335219236209, "grad_norm": 0.8870623111724854, "learning_rate": 2.4088861824785135e-05, "loss": 0.1411, "num_input_tokens_seen": 23917760, "step": 23775 }, { "epoch": 11.211692597831211, "grad_norm": 0.25766509771347046, "learning_rate": 2.4078582731244283e-05, "loss": 0.0481, "num_input_tokens_seen": 23923744, "step": 23780 }, { "epoch": 11.214049976426214, "grad_norm": 0.1411857306957245, "learning_rate": 2.406830379368373e-05, "loss": 0.1102, "num_input_tokens_seen": 23928896, "step": 23785 }, { "epoch": 11.216407355021216, "grad_norm": 1.624316930770874, "learning_rate": 2.4058025013843524e-05, "loss": 0.2966, "num_input_tokens_seen": 23935168, "step": 23790 }, { "epoch": 11.218764733616219, "grad_norm": 0.3274618089199066, "learning_rate": 2.404774639346369e-05, "loss": 0.0764, "num_input_tokens_seen": 23939904, "step": 23795 }, { "epoch": 11.221122112211221, "grad_norm": 0.5801093578338623, "learning_rate": 2.403746793428422e-05, "loss": 0.1044, "num_input_tokens_seen": 23943808, "step": 23800 }, { "epoch": 11.223479490806223, "grad_norm": 1.3351057767868042, "learning_rate": 2.40271896380451e-05, "loss": 0.2186, "num_input_tokens_seen": 23948096, "step": 23805 }, { "epoch": 11.225836869401226, "grad_norm": 0.234411358833313, "learning_rate": 2.401691150648624e-05, "loss": 0.1357, "num_input_tokens_seen": 23953536, "step": 23810 }, { "epoch": 11.228194247996228, "grad_norm": 0.506365180015564, "learning_rate": 2.4006633541347578e-05, "loss": 0.0539, "num_input_tokens_seen": 23958528, "step": 23815 }, { "epoch": 11.23055162659123, "grad_norm": 0.3679274618625641, "learning_rate": 2.3996355744368993e-05, "loss": 0.0991, "num_input_tokens_seen": 23964640, "step": 23820 }, { "epoch": 11.232909005186233, "grad_norm": 0.10661102831363678, "learning_rate": 2.3986078117290333e-05, "loss": 0.0607, "num_input_tokens_seen": 23969920, "step": 23825 }, { "epoch": 11.235266383781235, "grad_norm": 0.09435459971427917, "learning_rate": 2.397580066185143e-05, "loss": 0.1431, "num_input_tokens_seen": 23974944, "step": 23830 }, { "epoch": 11.237623762376238, "grad_norm": 2.380441665649414, "learning_rate": 2.396552337979209e-05, "loss": 0.1077, "num_input_tokens_seen": 23980032, "step": 23835 }, { "epoch": 11.23998114097124, "grad_norm": 0.13635633885860443, "learning_rate": 2.3955246272852084e-05, "loss": 0.1, "num_input_tokens_seen": 23985248, "step": 23840 }, { "epoch": 11.242338519566243, "grad_norm": 0.3206653892993927, "learning_rate": 2.3944969342771134e-05, "loss": 0.1243, "num_input_tokens_seen": 23990592, "step": 23845 }, { "epoch": 11.244695898161245, "grad_norm": 0.15307900309562683, "learning_rate": 2.3934692591288965e-05, "loss": 0.0748, "num_input_tokens_seen": 23995584, "step": 23850 }, { "epoch": 11.247053276756247, "grad_norm": 0.2612878382205963, "learning_rate": 2.392441602014525e-05, "loss": 0.2186, "num_input_tokens_seen": 24001248, "step": 23855 }, { "epoch": 11.24941065535125, "grad_norm": 0.218190535902977, "learning_rate": 2.3914139631079646e-05, "loss": 0.1385, "num_input_tokens_seen": 24006336, "step": 23860 }, { "epoch": 11.251768033946252, "grad_norm": 0.9735785722732544, "learning_rate": 2.3903863425831762e-05, "loss": 0.0941, "num_input_tokens_seen": 24010656, "step": 23865 }, { "epoch": 11.254125412541255, "grad_norm": 1.6675745248794556, "learning_rate": 2.3893587406141187e-05, "loss": 0.2133, "num_input_tokens_seen": 24016480, "step": 23870 }, { "epoch": 11.256482791136257, "grad_norm": 0.44183140993118286, "learning_rate": 2.388331157374748e-05, "loss": 0.1844, "num_input_tokens_seen": 24020864, "step": 23875 }, { "epoch": 11.25884016973126, "grad_norm": 0.4090074598789215, "learning_rate": 2.3873035930390167e-05, "loss": 0.067, "num_input_tokens_seen": 24025600, "step": 23880 }, { "epoch": 11.261197548326262, "grad_norm": 2.083183526992798, "learning_rate": 2.3862760477808733e-05, "loss": 0.1973, "num_input_tokens_seen": 24033440, "step": 23885 }, { "epoch": 11.263554926921264, "grad_norm": 1.0961673259735107, "learning_rate": 2.3852485217742638e-05, "loss": 0.1272, "num_input_tokens_seen": 24040992, "step": 23890 }, { "epoch": 11.265912305516267, "grad_norm": 1.0515834093093872, "learning_rate": 2.384221015193131e-05, "loss": 0.1983, "num_input_tokens_seen": 24047136, "step": 23895 }, { "epoch": 11.268269684111269, "grad_norm": 0.15422482788562775, "learning_rate": 2.3831935282114156e-05, "loss": 0.018, "num_input_tokens_seen": 24051584, "step": 23900 }, { "epoch": 11.270627062706271, "grad_norm": 0.7067266702651978, "learning_rate": 2.3821660610030523e-05, "loss": 0.0343, "num_input_tokens_seen": 24056000, "step": 23905 }, { "epoch": 11.272984441301272, "grad_norm": 2.468043088912964, "learning_rate": 2.3811386137419734e-05, "loss": 0.2219, "num_input_tokens_seen": 24061664, "step": 23910 }, { "epoch": 11.275341819896274, "grad_norm": 0.9255743622779846, "learning_rate": 2.3801111866021093e-05, "loss": 0.1241, "num_input_tokens_seen": 24067392, "step": 23915 }, { "epoch": 11.277699198491277, "grad_norm": 1.864259123802185, "learning_rate": 2.3790837797573864e-05, "loss": 0.0997, "num_input_tokens_seen": 24073184, "step": 23920 }, { "epoch": 11.28005657708628, "grad_norm": 0.020893430337309837, "learning_rate": 2.378056393381726e-05, "loss": 0.1209, "num_input_tokens_seen": 24077504, "step": 23925 }, { "epoch": 11.282413955681282, "grad_norm": 0.03417985513806343, "learning_rate": 2.3770290276490474e-05, "loss": 0.0718, "num_input_tokens_seen": 24082784, "step": 23930 }, { "epoch": 11.284771334276284, "grad_norm": 0.3450435996055603, "learning_rate": 2.3760016827332672e-05, "loss": 0.1603, "num_input_tokens_seen": 24087136, "step": 23935 }, { "epoch": 11.287128712871286, "grad_norm": 0.5038905143737793, "learning_rate": 2.3749743588082966e-05, "loss": 0.1812, "num_input_tokens_seen": 24092832, "step": 23940 }, { "epoch": 11.289486091466289, "grad_norm": 0.8856201767921448, "learning_rate": 2.373947056048044e-05, "loss": 0.0545, "num_input_tokens_seen": 24097952, "step": 23945 }, { "epoch": 11.291843470061291, "grad_norm": 0.8755683302879333, "learning_rate": 2.3729197746264143e-05, "loss": 0.1133, "num_input_tokens_seen": 24102880, "step": 23950 }, { "epoch": 11.294200848656294, "grad_norm": 0.1280658096075058, "learning_rate": 2.3718925147173092e-05, "loss": 0.0368, "num_input_tokens_seen": 24108128, "step": 23955 }, { "epoch": 11.296558227251296, "grad_norm": 0.25542333722114563, "learning_rate": 2.3708652764946267e-05, "loss": 0.1608, "num_input_tokens_seen": 24113568, "step": 23960 }, { "epoch": 11.298915605846299, "grad_norm": 0.3191545903682709, "learning_rate": 2.36983806013226e-05, "loss": 0.2007, "num_input_tokens_seen": 24118560, "step": 23965 }, { "epoch": 11.301272984441301, "grad_norm": 1.11915123462677, "learning_rate": 2.3688108658040994e-05, "loss": 0.0636, "num_input_tokens_seen": 24123584, "step": 23970 }, { "epoch": 11.303630363036303, "grad_norm": 1.0530619621276855, "learning_rate": 2.367783693684032e-05, "loss": 0.1158, "num_input_tokens_seen": 24128192, "step": 23975 }, { "epoch": 11.305987741631306, "grad_norm": 0.0728260800242424, "learning_rate": 2.3667565439459407e-05, "loss": 0.0478, "num_input_tokens_seen": 24132384, "step": 23980 }, { "epoch": 11.308345120226308, "grad_norm": 0.9105410575866699, "learning_rate": 2.3657294167637038e-05, "loss": 0.062, "num_input_tokens_seen": 24138240, "step": 23985 }, { "epoch": 11.31070249882131, "grad_norm": 0.11450576037168503, "learning_rate": 2.3647023123111965e-05, "loss": 0.0296, "num_input_tokens_seen": 24143200, "step": 23990 }, { "epoch": 11.313059877416313, "grad_norm": 1.6341094970703125, "learning_rate": 2.363675230762291e-05, "loss": 0.1698, "num_input_tokens_seen": 24148256, "step": 23995 }, { "epoch": 11.315417256011315, "grad_norm": 0.20199398696422577, "learning_rate": 2.3626481722908546e-05, "loss": 0.068, "num_input_tokens_seen": 24152608, "step": 24000 }, { "epoch": 11.317774634606318, "grad_norm": 2.1090002059936523, "learning_rate": 2.36162113707075e-05, "loss": 0.1743, "num_input_tokens_seen": 24156800, "step": 24005 }, { "epoch": 11.32013201320132, "grad_norm": 0.05474419146776199, "learning_rate": 2.3605941252758375e-05, "loss": 0.0697, "num_input_tokens_seen": 24161984, "step": 24010 }, { "epoch": 11.322489391796323, "grad_norm": 0.23365235328674316, "learning_rate": 2.3595671370799724e-05, "loss": 0.1099, "num_input_tokens_seen": 24166048, "step": 24015 }, { "epoch": 11.324846770391325, "grad_norm": 1.6149978637695312, "learning_rate": 2.358540172657007e-05, "loss": 0.1317, "num_input_tokens_seen": 24171232, "step": 24020 }, { "epoch": 11.327204148986327, "grad_norm": 0.08992253988981247, "learning_rate": 2.3575132321807883e-05, "loss": 0.2057, "num_input_tokens_seen": 24175168, "step": 24025 }, { "epoch": 11.32956152758133, "grad_norm": 0.8667755126953125, "learning_rate": 2.356486315825161e-05, "loss": 0.1312, "num_input_tokens_seen": 24180224, "step": 24030 }, { "epoch": 11.331918906176332, "grad_norm": 0.05208795890212059, "learning_rate": 2.3554594237639627e-05, "loss": 0.1268, "num_input_tokens_seen": 24185600, "step": 24035 }, { "epoch": 11.334276284771335, "grad_norm": 0.08676210045814514, "learning_rate": 2.3544325561710295e-05, "loss": 0.2121, "num_input_tokens_seen": 24190656, "step": 24040 }, { "epoch": 11.336633663366337, "grad_norm": 0.17138053476810455, "learning_rate": 2.3534057132201932e-05, "loss": 0.0879, "num_input_tokens_seen": 24194688, "step": 24045 }, { "epoch": 11.33899104196134, "grad_norm": 1.3415265083312988, "learning_rate": 2.3523788950852814e-05, "loss": 0.2109, "num_input_tokens_seen": 24200928, "step": 24050 }, { "epoch": 11.341348420556342, "grad_norm": 0.4919356107711792, "learning_rate": 2.3513521019401153e-05, "loss": 0.1955, "num_input_tokens_seen": 24204896, "step": 24055 }, { "epoch": 11.343705799151344, "grad_norm": 0.159027099609375, "learning_rate": 2.3503253339585145e-05, "loss": 0.0246, "num_input_tokens_seen": 24209856, "step": 24060 }, { "epoch": 11.346063177746347, "grad_norm": 1.2158042192459106, "learning_rate": 2.3492985913142933e-05, "loss": 0.1479, "num_input_tokens_seen": 24216640, "step": 24065 }, { "epoch": 11.348420556341349, "grad_norm": 0.43910056352615356, "learning_rate": 2.3482718741812625e-05, "loss": 0.0315, "num_input_tokens_seen": 24221088, "step": 24070 }, { "epoch": 11.350777934936351, "grad_norm": 0.8752781748771667, "learning_rate": 2.3472451827332267e-05, "loss": 0.145, "num_input_tokens_seen": 24225728, "step": 24075 }, { "epoch": 11.353135313531354, "grad_norm": 0.45982375741004944, "learning_rate": 2.3462185171439878e-05, "loss": 0.0407, "num_input_tokens_seen": 24231008, "step": 24080 }, { "epoch": 11.355492692126356, "grad_norm": 0.3343949615955353, "learning_rate": 2.3451918775873426e-05, "loss": 0.0713, "num_input_tokens_seen": 24236608, "step": 24085 }, { "epoch": 11.357850070721359, "grad_norm": 0.19783398509025574, "learning_rate": 2.3441652642370853e-05, "loss": 0.0662, "num_input_tokens_seen": 24242240, "step": 24090 }, { "epoch": 11.360207449316361, "grad_norm": 0.5878002047538757, "learning_rate": 2.3431386772670013e-05, "loss": 0.1972, "num_input_tokens_seen": 24249024, "step": 24095 }, { "epoch": 11.362564827911363, "grad_norm": 0.33660927414894104, "learning_rate": 2.3421121168508768e-05, "loss": 0.0816, "num_input_tokens_seen": 24254400, "step": 24100 }, { "epoch": 11.364922206506366, "grad_norm": 0.4062296748161316, "learning_rate": 2.3410855831624896e-05, "loss": 0.0485, "num_input_tokens_seen": 24258752, "step": 24105 }, { "epoch": 11.367279585101368, "grad_norm": 1.4599734544754028, "learning_rate": 2.3400590763756154e-05, "loss": 0.1364, "num_input_tokens_seen": 24263616, "step": 24110 }, { "epoch": 11.369636963696369, "grad_norm": 1.5280355215072632, "learning_rate": 2.3390325966640237e-05, "loss": 0.1211, "num_input_tokens_seen": 24269792, "step": 24115 }, { "epoch": 11.371994342291371, "grad_norm": 0.9372276067733765, "learning_rate": 2.3380061442014802e-05, "loss": 0.1762, "num_input_tokens_seen": 24275360, "step": 24120 }, { "epoch": 11.374351720886374, "grad_norm": 0.016108408570289612, "learning_rate": 2.336979719161746e-05, "loss": 0.0766, "num_input_tokens_seen": 24280096, "step": 24125 }, { "epoch": 11.376709099481376, "grad_norm": 0.20102347433567047, "learning_rate": 2.335953321718578e-05, "loss": 0.0304, "num_input_tokens_seen": 24284608, "step": 24130 }, { "epoch": 11.379066478076378, "grad_norm": 1.746261715888977, "learning_rate": 2.334926952045727e-05, "loss": 0.077, "num_input_tokens_seen": 24288704, "step": 24135 }, { "epoch": 11.38142385667138, "grad_norm": 0.5283724665641785, "learning_rate": 2.3339006103169397e-05, "loss": 0.1272, "num_input_tokens_seen": 24292960, "step": 24140 }, { "epoch": 11.383781235266383, "grad_norm": 2.1911003589630127, "learning_rate": 2.332874296705959e-05, "loss": 0.1616, "num_input_tokens_seen": 24298176, "step": 24145 }, { "epoch": 11.386138613861386, "grad_norm": 1.1882025003433228, "learning_rate": 2.3318480113865232e-05, "loss": 0.0894, "num_input_tokens_seen": 24302848, "step": 24150 }, { "epoch": 11.388495992456388, "grad_norm": 2.2944040298461914, "learning_rate": 2.3308217545323628e-05, "loss": 0.1239, "num_input_tokens_seen": 24307296, "step": 24155 }, { "epoch": 11.39085337105139, "grad_norm": 0.8086540699005127, "learning_rate": 2.3297955263172076e-05, "loss": 0.0676, "num_input_tokens_seen": 24312608, "step": 24160 }, { "epoch": 11.393210749646393, "grad_norm": 0.18246786296367645, "learning_rate": 2.3287693269147794e-05, "loss": 0.0595, "num_input_tokens_seen": 24317024, "step": 24165 }, { "epoch": 11.395568128241395, "grad_norm": 1.228733777999878, "learning_rate": 2.3277431564987974e-05, "loss": 0.1237, "num_input_tokens_seen": 24321536, "step": 24170 }, { "epoch": 11.397925506836398, "grad_norm": 0.7724920511245728, "learning_rate": 2.3267170152429736e-05, "loss": 0.0973, "num_input_tokens_seen": 24325536, "step": 24175 }, { "epoch": 11.4002828854314, "grad_norm": 1.2803630828857422, "learning_rate": 2.3256909033210175e-05, "loss": 0.1596, "num_input_tokens_seen": 24330144, "step": 24180 }, { "epoch": 11.402640264026402, "grad_norm": 0.010471051558852196, "learning_rate": 2.3246648209066312e-05, "loss": 0.2101, "num_input_tokens_seen": 24334752, "step": 24185 }, { "epoch": 11.404997642621405, "grad_norm": 0.17585332691669464, "learning_rate": 2.3236387681735146e-05, "loss": 0.0178, "num_input_tokens_seen": 24339712, "step": 24190 }, { "epoch": 11.407355021216407, "grad_norm": 0.829646110534668, "learning_rate": 2.3226127452953598e-05, "loss": 0.1003, "num_input_tokens_seen": 24344704, "step": 24195 }, { "epoch": 11.40971239981141, "grad_norm": 1.0665991306304932, "learning_rate": 2.321586752445855e-05, "loss": 0.0352, "num_input_tokens_seen": 24351040, "step": 24200 }, { "epoch": 11.412069778406412, "grad_norm": 0.28435319662094116, "learning_rate": 2.3205607897986837e-05, "loss": 0.1186, "num_input_tokens_seen": 24356224, "step": 24205 }, { "epoch": 11.414427157001414, "grad_norm": 1.0299766063690186, "learning_rate": 2.3195348575275246e-05, "loss": 0.0367, "num_input_tokens_seen": 24361216, "step": 24210 }, { "epoch": 11.416784535596417, "grad_norm": 0.04198412224650383, "learning_rate": 2.3185089558060497e-05, "loss": 0.0255, "num_input_tokens_seen": 24366880, "step": 24215 }, { "epoch": 11.41914191419142, "grad_norm": 0.05868959426879883, "learning_rate": 2.3174830848079266e-05, "loss": 0.1074, "num_input_tokens_seen": 24372000, "step": 24220 }, { "epoch": 11.421499292786422, "grad_norm": 0.06919743865728378, "learning_rate": 2.3164572447068186e-05, "loss": 0.0389, "num_input_tokens_seen": 24376160, "step": 24225 }, { "epoch": 11.423856671381424, "grad_norm": 1.2805875539779663, "learning_rate": 2.315431435676383e-05, "loss": 0.2016, "num_input_tokens_seen": 24381088, "step": 24230 }, { "epoch": 11.426214049976426, "grad_norm": 0.5367804169654846, "learning_rate": 2.3144056578902713e-05, "loss": 0.0156, "num_input_tokens_seen": 24387136, "step": 24235 }, { "epoch": 11.428571428571429, "grad_norm": 0.14116333425045013, "learning_rate": 2.31337991152213e-05, "loss": 0.0588, "num_input_tokens_seen": 24394016, "step": 24240 }, { "epoch": 11.430928807166431, "grad_norm": 0.9939568638801575, "learning_rate": 2.3123541967456016e-05, "loss": 0.0523, "num_input_tokens_seen": 24399872, "step": 24245 }, { "epoch": 11.433286185761434, "grad_norm": 0.036571066826581955, "learning_rate": 2.311328513734322e-05, "loss": 0.0799, "num_input_tokens_seen": 24404864, "step": 24250 }, { "epoch": 11.435643564356436, "grad_norm": 0.0875057503581047, "learning_rate": 2.3103028626619222e-05, "loss": 0.1161, "num_input_tokens_seen": 24409280, "step": 24255 }, { "epoch": 11.438000942951438, "grad_norm": 0.5983738303184509, "learning_rate": 2.3092772437020256e-05, "loss": 0.0427, "num_input_tokens_seen": 24413792, "step": 24260 }, { "epoch": 11.44035832154644, "grad_norm": 0.008171312510967255, "learning_rate": 2.3082516570282535e-05, "loss": 0.1381, "num_input_tokens_seen": 24418592, "step": 24265 }, { "epoch": 11.442715700141443, "grad_norm": 0.30942556262016296, "learning_rate": 2.30722610281422e-05, "loss": 0.1086, "num_input_tokens_seen": 24423296, "step": 24270 }, { "epoch": 11.445073078736446, "grad_norm": 0.3127667009830475, "learning_rate": 2.3062005812335348e-05, "loss": 0.3978, "num_input_tokens_seen": 24428032, "step": 24275 }, { "epoch": 11.447430457331448, "grad_norm": 0.8903291821479797, "learning_rate": 2.3051750924598002e-05, "loss": 0.1738, "num_input_tokens_seen": 24433792, "step": 24280 }, { "epoch": 11.44978783592645, "grad_norm": 0.23815664649009705, "learning_rate": 2.304149636666614e-05, "loss": 0.036, "num_input_tokens_seen": 24439520, "step": 24285 }, { "epoch": 11.452145214521453, "grad_norm": 0.6676105260848999, "learning_rate": 2.30312421402757e-05, "loss": 0.2354, "num_input_tokens_seen": 24443232, "step": 24290 }, { "epoch": 11.454502593116455, "grad_norm": 0.5950478315353394, "learning_rate": 2.3020988247162536e-05, "loss": 0.0356, "num_input_tokens_seen": 24447744, "step": 24295 }, { "epoch": 11.456859971711458, "grad_norm": 0.08322751522064209, "learning_rate": 2.3010734689062457e-05, "loss": 0.1489, "num_input_tokens_seen": 24451872, "step": 24300 }, { "epoch": 11.45921735030646, "grad_norm": 2.624539852142334, "learning_rate": 2.300048146771122e-05, "loss": 0.2179, "num_input_tokens_seen": 24457472, "step": 24305 }, { "epoch": 11.46157472890146, "grad_norm": 0.43408769369125366, "learning_rate": 2.2990228584844516e-05, "loss": 0.0726, "num_input_tokens_seen": 24463296, "step": 24310 }, { "epoch": 11.463932107496463, "grad_norm": 0.031854767352342606, "learning_rate": 2.2979976042197997e-05, "loss": 0.0551, "num_input_tokens_seen": 24468704, "step": 24315 }, { "epoch": 11.466289486091465, "grad_norm": 1.2538307905197144, "learning_rate": 2.296972384150723e-05, "loss": 0.1777, "num_input_tokens_seen": 24474496, "step": 24320 }, { "epoch": 11.468646864686468, "grad_norm": 0.3767624795436859, "learning_rate": 2.2959471984507746e-05, "loss": 0.1685, "num_input_tokens_seen": 24478944, "step": 24325 }, { "epoch": 11.47100424328147, "grad_norm": 0.36862000823020935, "learning_rate": 2.294922047293501e-05, "loss": 0.3608, "num_input_tokens_seen": 24484672, "step": 24330 }, { "epoch": 11.473361621876473, "grad_norm": 0.4591468274593353, "learning_rate": 2.2938969308524433e-05, "loss": 0.1256, "num_input_tokens_seen": 24489184, "step": 24335 }, { "epoch": 11.475719000471475, "grad_norm": 0.25539398193359375, "learning_rate": 2.2928718493011348e-05, "loss": 0.0305, "num_input_tokens_seen": 24493632, "step": 24340 }, { "epoch": 11.478076379066477, "grad_norm": 1.1386921405792236, "learning_rate": 2.291846802813106e-05, "loss": 0.2899, "num_input_tokens_seen": 24499552, "step": 24345 }, { "epoch": 11.48043375766148, "grad_norm": 0.2727643549442291, "learning_rate": 2.2908217915618786e-05, "loss": 0.0606, "num_input_tokens_seen": 24504256, "step": 24350 }, { "epoch": 11.482791136256482, "grad_norm": 0.5085097551345825, "learning_rate": 2.289796815720971e-05, "loss": 0.1472, "num_input_tokens_seen": 24508480, "step": 24355 }, { "epoch": 11.485148514851485, "grad_norm": 0.7577383518218994, "learning_rate": 2.2887718754638933e-05, "loss": 0.0462, "num_input_tokens_seen": 24512992, "step": 24360 }, { "epoch": 11.487505893446487, "grad_norm": 1.780502200126648, "learning_rate": 2.28774697096415e-05, "loss": 0.0932, "num_input_tokens_seen": 24519872, "step": 24365 }, { "epoch": 11.48986327204149, "grad_norm": 1.6886084079742432, "learning_rate": 2.286722102395241e-05, "loss": 0.1369, "num_input_tokens_seen": 24524544, "step": 24370 }, { "epoch": 11.492220650636492, "grad_norm": 1.421913743019104, "learning_rate": 2.2856972699306595e-05, "loss": 0.1323, "num_input_tokens_seen": 24530368, "step": 24375 }, { "epoch": 11.494578029231494, "grad_norm": 0.506259560585022, "learning_rate": 2.2846724737438907e-05, "loss": 0.052, "num_input_tokens_seen": 24534784, "step": 24380 }, { "epoch": 11.496935407826497, "grad_norm": 0.6530354619026184, "learning_rate": 2.283647714008416e-05, "loss": 0.2704, "num_input_tokens_seen": 24539712, "step": 24385 }, { "epoch": 11.499292786421499, "grad_norm": 0.3365597724914551, "learning_rate": 2.2826229908977105e-05, "loss": 0.0331, "num_input_tokens_seen": 24544512, "step": 24390 }, { "epoch": 11.501650165016502, "grad_norm": 0.04617554694414139, "learning_rate": 2.2815983045852414e-05, "loss": 0.0829, "num_input_tokens_seen": 24550464, "step": 24395 }, { "epoch": 11.504007543611504, "grad_norm": 0.35726433992385864, "learning_rate": 2.2805736552444713e-05, "loss": 0.1987, "num_input_tokens_seen": 24555200, "step": 24400 }, { "epoch": 11.506364922206506, "grad_norm": 1.1919203996658325, "learning_rate": 2.2795490430488552e-05, "loss": 0.1517, "num_input_tokens_seen": 24560416, "step": 24405 }, { "epoch": 11.508722300801509, "grad_norm": 0.6362048387527466, "learning_rate": 2.2785244681718435e-05, "loss": 0.2632, "num_input_tokens_seen": 24564544, "step": 24410 }, { "epoch": 11.511079679396511, "grad_norm": 0.7258095741271973, "learning_rate": 2.277499930786879e-05, "loss": 0.0887, "num_input_tokens_seen": 24570784, "step": 24415 }, { "epoch": 11.513437057991514, "grad_norm": 0.5612152814865112, "learning_rate": 2.2764754310673982e-05, "loss": 0.0589, "num_input_tokens_seen": 24575392, "step": 24420 }, { "epoch": 11.515794436586516, "grad_norm": 0.5172460675239563, "learning_rate": 2.2754509691868317e-05, "loss": 0.1879, "num_input_tokens_seen": 24580064, "step": 24425 }, { "epoch": 11.518151815181518, "grad_norm": 0.28085488080978394, "learning_rate": 2.2744265453186035e-05, "loss": 0.1006, "num_input_tokens_seen": 24585632, "step": 24430 }, { "epoch": 11.52050919377652, "grad_norm": 0.3379823863506317, "learning_rate": 2.2734021596361312e-05, "loss": 0.0462, "num_input_tokens_seen": 24590048, "step": 24435 }, { "epoch": 11.522866572371523, "grad_norm": 0.08645036071538925, "learning_rate": 2.2723778123128257e-05, "loss": 0.0573, "num_input_tokens_seen": 24595040, "step": 24440 }, { "epoch": 11.525223950966526, "grad_norm": 1.1790711879730225, "learning_rate": 2.2713535035220922e-05, "loss": 0.1907, "num_input_tokens_seen": 24601184, "step": 24445 }, { "epoch": 11.527581329561528, "grad_norm": 1.7724181413650513, "learning_rate": 2.2703292334373278e-05, "loss": 0.0603, "num_input_tokens_seen": 24605536, "step": 24450 }, { "epoch": 11.52993870815653, "grad_norm": 0.44504159688949585, "learning_rate": 2.269305002231925e-05, "loss": 0.031, "num_input_tokens_seen": 24610784, "step": 24455 }, { "epoch": 11.532296086751533, "grad_norm": 1.6727712154388428, "learning_rate": 2.2682808100792683e-05, "loss": 0.1234, "num_input_tokens_seen": 24615360, "step": 24460 }, { "epoch": 11.534653465346535, "grad_norm": 1.2184280157089233, "learning_rate": 2.2672566571527357e-05, "loss": 0.274, "num_input_tokens_seen": 24620992, "step": 24465 }, { "epoch": 11.537010843941538, "grad_norm": 0.10796233266592026, "learning_rate": 2.2662325436256994e-05, "loss": 0.009, "num_input_tokens_seen": 24626368, "step": 24470 }, { "epoch": 11.53936822253654, "grad_norm": 0.3292594254016876, "learning_rate": 2.2652084696715247e-05, "loss": 0.1456, "num_input_tokens_seen": 24632000, "step": 24475 }, { "epoch": 11.541725601131542, "grad_norm": 0.06513920426368713, "learning_rate": 2.2641844354635694e-05, "loss": 0.129, "num_input_tokens_seen": 24636480, "step": 24480 }, { "epoch": 11.544082979726545, "grad_norm": 0.8943133354187012, "learning_rate": 2.263160441175186e-05, "loss": 0.0394, "num_input_tokens_seen": 24641024, "step": 24485 }, { "epoch": 11.546440358321547, "grad_norm": 0.3080117106437683, "learning_rate": 2.262136486979718e-05, "loss": 0.1456, "num_input_tokens_seen": 24646080, "step": 24490 }, { "epoch": 11.54879773691655, "grad_norm": 0.1472013145685196, "learning_rate": 2.2611125730505035e-05, "loss": 0.0603, "num_input_tokens_seen": 24650304, "step": 24495 }, { "epoch": 11.551155115511552, "grad_norm": 0.6235584616661072, "learning_rate": 2.2600886995608744e-05, "loss": 0.1125, "num_input_tokens_seen": 24655168, "step": 24500 }, { "epoch": 11.553512494106554, "grad_norm": 0.0765698105096817, "learning_rate": 2.2590648666841556e-05, "loss": 0.1674, "num_input_tokens_seen": 24660576, "step": 24505 }, { "epoch": 11.555869872701557, "grad_norm": 0.6320438981056213, "learning_rate": 2.2580410745936636e-05, "loss": 0.0425, "num_input_tokens_seen": 24665184, "step": 24510 }, { "epoch": 11.558227251296557, "grad_norm": 1.7379961013793945, "learning_rate": 2.2570173234627095e-05, "loss": 0.1158, "num_input_tokens_seen": 24669408, "step": 24515 }, { "epoch": 11.56058462989156, "grad_norm": 1.9984604120254517, "learning_rate": 2.255993613464597e-05, "loss": 0.2407, "num_input_tokens_seen": 24674240, "step": 24520 }, { "epoch": 11.562942008486562, "grad_norm": 1.1132076978683472, "learning_rate": 2.254969944772623e-05, "loss": 0.1446, "num_input_tokens_seen": 24679616, "step": 24525 }, { "epoch": 11.565299387081565, "grad_norm": 0.37461674213409424, "learning_rate": 2.2539463175600764e-05, "loss": 0.1452, "num_input_tokens_seen": 24684128, "step": 24530 }, { "epoch": 11.567656765676567, "grad_norm": 0.8064287900924683, "learning_rate": 2.2529227320002405e-05, "loss": 0.1432, "num_input_tokens_seen": 24688320, "step": 24535 }, { "epoch": 11.57001414427157, "grad_norm": 0.5975262522697449, "learning_rate": 2.251899188266391e-05, "loss": 0.1094, "num_input_tokens_seen": 24694272, "step": 24540 }, { "epoch": 11.572371522866572, "grad_norm": 4.022890567779541, "learning_rate": 2.2508756865317973e-05, "loss": 0.1162, "num_input_tokens_seen": 24699520, "step": 24545 }, { "epoch": 11.574728901461574, "grad_norm": 0.10135461390018463, "learning_rate": 2.2498522269697193e-05, "loss": 0.0874, "num_input_tokens_seen": 24704256, "step": 24550 }, { "epoch": 11.577086280056577, "grad_norm": 0.8012834787368774, "learning_rate": 2.2488288097534116e-05, "loss": 0.1004, "num_input_tokens_seen": 24709888, "step": 24555 }, { "epoch": 11.579443658651579, "grad_norm": 0.15912842750549316, "learning_rate": 2.247805435056122e-05, "loss": 0.0516, "num_input_tokens_seen": 24714272, "step": 24560 }, { "epoch": 11.581801037246581, "grad_norm": 0.7276014685630798, "learning_rate": 2.246782103051091e-05, "loss": 0.0484, "num_input_tokens_seen": 24718976, "step": 24565 }, { "epoch": 11.584158415841584, "grad_norm": 0.965088427066803, "learning_rate": 2.2457588139115496e-05, "loss": 0.2398, "num_input_tokens_seen": 24724096, "step": 24570 }, { "epoch": 11.586515794436586, "grad_norm": 0.3577289879322052, "learning_rate": 2.2447355678107247e-05, "loss": 0.0229, "num_input_tokens_seen": 24729408, "step": 24575 }, { "epoch": 11.588873173031589, "grad_norm": 0.9099522233009338, "learning_rate": 2.2437123649218335e-05, "loss": 0.1776, "num_input_tokens_seen": 24734336, "step": 24580 }, { "epoch": 11.591230551626591, "grad_norm": 0.23522798717021942, "learning_rate": 2.242689205418088e-05, "loss": 0.0607, "num_input_tokens_seen": 24738432, "step": 24585 }, { "epoch": 11.593587930221593, "grad_norm": 1.4419084787368774, "learning_rate": 2.2416660894726908e-05, "loss": 0.0599, "num_input_tokens_seen": 24743904, "step": 24590 }, { "epoch": 11.595945308816596, "grad_norm": 0.05038844794034958, "learning_rate": 2.2406430172588386e-05, "loss": 0.0074, "num_input_tokens_seen": 24749088, "step": 24595 }, { "epoch": 11.598302687411598, "grad_norm": 0.5563836693763733, "learning_rate": 2.23961998894972e-05, "loss": 0.0574, "num_input_tokens_seen": 24753728, "step": 24600 }, { "epoch": 11.6006600660066, "grad_norm": 0.2513405680656433, "learning_rate": 2.2385970047185163e-05, "loss": 0.061, "num_input_tokens_seen": 24759072, "step": 24605 }, { "epoch": 11.603017444601603, "grad_norm": 1.713216781616211, "learning_rate": 2.237574064738401e-05, "loss": 0.2886, "num_input_tokens_seen": 24763744, "step": 24610 }, { "epoch": 11.605374823196605, "grad_norm": 0.46231409907341003, "learning_rate": 2.236551169182541e-05, "loss": 0.1427, "num_input_tokens_seen": 24768704, "step": 24615 }, { "epoch": 11.607732201791608, "grad_norm": 1.1958597898483276, "learning_rate": 2.235528318224095e-05, "loss": 0.165, "num_input_tokens_seen": 24773056, "step": 24620 }, { "epoch": 11.61008958038661, "grad_norm": 0.041749466210603714, "learning_rate": 2.2345055120362143e-05, "loss": 0.0909, "num_input_tokens_seen": 24778272, "step": 24625 }, { "epoch": 11.612446958981613, "grad_norm": 0.6927297711372375, "learning_rate": 2.2334827507920426e-05, "loss": 0.0843, "num_input_tokens_seen": 24783296, "step": 24630 }, { "epoch": 11.614804337576615, "grad_norm": 1.0848723649978638, "learning_rate": 2.2324600346647153e-05, "loss": 0.0875, "num_input_tokens_seen": 24789376, "step": 24635 }, { "epoch": 11.617161716171617, "grad_norm": 0.0906074121594429, "learning_rate": 2.231437363827362e-05, "loss": 0.1388, "num_input_tokens_seen": 24793824, "step": 24640 }, { "epoch": 11.61951909476662, "grad_norm": 0.18965540826320648, "learning_rate": 2.2304147384531038e-05, "loss": 0.0995, "num_input_tokens_seen": 24798976, "step": 24645 }, { "epoch": 11.621876473361622, "grad_norm": 0.3250619173049927, "learning_rate": 2.2293921587150524e-05, "loss": 0.1239, "num_input_tokens_seen": 24802912, "step": 24650 }, { "epoch": 11.624233851956625, "grad_norm": 3.229257345199585, "learning_rate": 2.2283696247863135e-05, "loss": 0.2277, "num_input_tokens_seen": 24808480, "step": 24655 }, { "epoch": 11.626591230551627, "grad_norm": 0.17733384668827057, "learning_rate": 2.2273471368399857e-05, "loss": 0.1573, "num_input_tokens_seen": 24813600, "step": 24660 }, { "epoch": 11.62894860914663, "grad_norm": 1.3542991876602173, "learning_rate": 2.2263246950491584e-05, "loss": 0.1155, "num_input_tokens_seen": 24818560, "step": 24665 }, { "epoch": 11.631305987741632, "grad_norm": 0.4526744484901428, "learning_rate": 2.225302299586913e-05, "loss": 0.1292, "num_input_tokens_seen": 24822400, "step": 24670 }, { "epoch": 11.633663366336634, "grad_norm": 1.297378659248352, "learning_rate": 2.224279950626324e-05, "loss": 0.0993, "num_input_tokens_seen": 24827232, "step": 24675 }, { "epoch": 11.636020744931637, "grad_norm": 0.8881766200065613, "learning_rate": 2.2232576483404582e-05, "loss": 0.0978, "num_input_tokens_seen": 24832096, "step": 24680 }, { "epoch": 11.638378123526639, "grad_norm": 0.01773044653236866, "learning_rate": 2.222235392902374e-05, "loss": 0.0566, "num_input_tokens_seen": 24836512, "step": 24685 }, { "epoch": 11.640735502121641, "grad_norm": 0.6309999227523804, "learning_rate": 2.2212131844851212e-05, "loss": 0.1573, "num_input_tokens_seen": 24841536, "step": 24690 }, { "epoch": 11.643092880716644, "grad_norm": 0.2243332415819168, "learning_rate": 2.220191023261743e-05, "loss": 0.1072, "num_input_tokens_seen": 24846176, "step": 24695 }, { "epoch": 11.645450259311646, "grad_norm": 1.4965355396270752, "learning_rate": 2.2191689094052736e-05, "loss": 0.1517, "num_input_tokens_seen": 24850144, "step": 24700 }, { "epoch": 11.647807637906649, "grad_norm": 2.308824300765991, "learning_rate": 2.21814684308874e-05, "loss": 0.1321, "num_input_tokens_seen": 24855008, "step": 24705 }, { "epoch": 11.65016501650165, "grad_norm": 0.1635081022977829, "learning_rate": 2.21712482448516e-05, "loss": 0.0585, "num_input_tokens_seen": 24861184, "step": 24710 }, { "epoch": 11.652522395096653, "grad_norm": 0.4825640916824341, "learning_rate": 2.2161028537675448e-05, "loss": 0.0481, "num_input_tokens_seen": 24866496, "step": 24715 }, { "epoch": 11.654879773691654, "grad_norm": 0.23394541442394257, "learning_rate": 2.2150809311088953e-05, "loss": 0.1199, "num_input_tokens_seen": 24872608, "step": 24720 }, { "epoch": 11.657237152286656, "grad_norm": 0.750826895236969, "learning_rate": 2.2140590566822063e-05, "loss": 0.0917, "num_input_tokens_seen": 24878752, "step": 24725 }, { "epoch": 11.659594530881659, "grad_norm": 0.10948271304368973, "learning_rate": 2.2130372306604643e-05, "loss": 0.0515, "num_input_tokens_seen": 24883648, "step": 24730 }, { "epoch": 11.661951909476661, "grad_norm": 0.4267961084842682, "learning_rate": 2.2120154532166474e-05, "loss": 0.0954, "num_input_tokens_seen": 24888416, "step": 24735 }, { "epoch": 11.664309288071664, "grad_norm": 1.8512078523635864, "learning_rate": 2.210993724523724e-05, "loss": 0.0472, "num_input_tokens_seen": 24894208, "step": 24740 }, { "epoch": 11.666666666666666, "grad_norm": 0.05279023200273514, "learning_rate": 2.2099720447546557e-05, "loss": 0.058, "num_input_tokens_seen": 24899936, "step": 24745 }, { "epoch": 11.669024045261668, "grad_norm": 1.0714054107666016, "learning_rate": 2.208950414082396e-05, "loss": 0.1445, "num_input_tokens_seen": 24904928, "step": 24750 }, { "epoch": 11.67138142385667, "grad_norm": 0.5701895356178284, "learning_rate": 2.2079288326798902e-05, "loss": 0.3727, "num_input_tokens_seen": 24909408, "step": 24755 }, { "epoch": 11.673738802451673, "grad_norm": 0.046998389065265656, "learning_rate": 2.2069073007200733e-05, "loss": 0.1505, "num_input_tokens_seen": 24913088, "step": 24760 }, { "epoch": 11.676096181046676, "grad_norm": 0.8662429451942444, "learning_rate": 2.2058858183758738e-05, "loss": 0.1626, "num_input_tokens_seen": 24917152, "step": 24765 }, { "epoch": 11.678453559641678, "grad_norm": 1.2552438974380493, "learning_rate": 2.204864385820212e-05, "loss": 0.2396, "num_input_tokens_seen": 24921664, "step": 24770 }, { "epoch": 11.68081093823668, "grad_norm": 1.1275438070297241, "learning_rate": 2.2038430032259983e-05, "loss": 0.1598, "num_input_tokens_seen": 24926144, "step": 24775 }, { "epoch": 11.683168316831683, "grad_norm": 0.0748269334435463, "learning_rate": 2.2028216707661357e-05, "loss": 0.1124, "num_input_tokens_seen": 24931904, "step": 24780 }, { "epoch": 11.685525695426685, "grad_norm": 0.011839639395475388, "learning_rate": 2.201800388613518e-05, "loss": 0.0381, "num_input_tokens_seen": 24936352, "step": 24785 }, { "epoch": 11.687883074021688, "grad_norm": 0.08628162741661072, "learning_rate": 2.2007791569410318e-05, "loss": 0.0768, "num_input_tokens_seen": 24940960, "step": 24790 }, { "epoch": 11.69024045261669, "grad_norm": 0.159781813621521, "learning_rate": 2.1997579759215543e-05, "loss": 0.0532, "num_input_tokens_seen": 24947328, "step": 24795 }, { "epoch": 11.692597831211692, "grad_norm": 0.2906974256038666, "learning_rate": 2.1987368457279527e-05, "loss": 0.0965, "num_input_tokens_seen": 24951936, "step": 24800 }, { "epoch": 11.694955209806695, "grad_norm": 0.07621894031763077, "learning_rate": 2.1977157665330884e-05, "loss": 0.1043, "num_input_tokens_seen": 24958208, "step": 24805 }, { "epoch": 11.697312588401697, "grad_norm": 1.603488802909851, "learning_rate": 2.1966947385098125e-05, "loss": 0.235, "num_input_tokens_seen": 24963552, "step": 24810 }, { "epoch": 11.6996699669967, "grad_norm": 0.3935483396053314, "learning_rate": 2.1956737618309674e-05, "loss": 0.1501, "num_input_tokens_seen": 24968352, "step": 24815 }, { "epoch": 11.702027345591702, "grad_norm": 0.27482491731643677, "learning_rate": 2.1946528366693875e-05, "loss": 0.0219, "num_input_tokens_seen": 24972800, "step": 24820 }, { "epoch": 11.704384724186705, "grad_norm": 0.02721700631082058, "learning_rate": 2.1936319631978974e-05, "loss": 0.1378, "num_input_tokens_seen": 24977664, "step": 24825 }, { "epoch": 11.706742102781707, "grad_norm": 0.7784610390663147, "learning_rate": 2.192611141589314e-05, "loss": 0.0494, "num_input_tokens_seen": 24981920, "step": 24830 }, { "epoch": 11.70909948137671, "grad_norm": 0.827428936958313, "learning_rate": 2.191590372016446e-05, "loss": 0.0865, "num_input_tokens_seen": 24988160, "step": 24835 }, { "epoch": 11.711456859971712, "grad_norm": 1.3134756088256836, "learning_rate": 2.190569654652091e-05, "loss": 0.1466, "num_input_tokens_seen": 24994496, "step": 24840 }, { "epoch": 11.713814238566714, "grad_norm": 0.4200737476348877, "learning_rate": 2.1895489896690393e-05, "loss": 0.0552, "num_input_tokens_seen": 25000320, "step": 24845 }, { "epoch": 11.716171617161717, "grad_norm": 0.09279226511716843, "learning_rate": 2.1885283772400727e-05, "loss": 0.0346, "num_input_tokens_seen": 25004416, "step": 24850 }, { "epoch": 11.718528995756719, "grad_norm": 1.0266798734664917, "learning_rate": 2.187507817537964e-05, "loss": 0.0586, "num_input_tokens_seen": 25009824, "step": 24855 }, { "epoch": 11.720886374351721, "grad_norm": 0.5385528206825256, "learning_rate": 2.1864873107354752e-05, "loss": 0.0991, "num_input_tokens_seen": 25015136, "step": 24860 }, { "epoch": 11.723243752946724, "grad_norm": 0.2221657782793045, "learning_rate": 2.1854668570053616e-05, "loss": 0.2131, "num_input_tokens_seen": 25020576, "step": 24865 }, { "epoch": 11.725601131541726, "grad_norm": 0.43194901943206787, "learning_rate": 2.184446456520369e-05, "loss": 0.0218, "num_input_tokens_seen": 25024736, "step": 24870 }, { "epoch": 11.727958510136729, "grad_norm": 1.2767107486724854, "learning_rate": 2.1834261094532338e-05, "loss": 0.1021, "num_input_tokens_seen": 25029664, "step": 24875 }, { "epoch": 11.730315888731731, "grad_norm": 0.16240552067756653, "learning_rate": 2.1824058159766824e-05, "loss": 0.1045, "num_input_tokens_seen": 25034624, "step": 24880 }, { "epoch": 11.732673267326733, "grad_norm": 0.2618301510810852, "learning_rate": 2.1813855762634342e-05, "loss": 0.1535, "num_input_tokens_seen": 25038688, "step": 24885 }, { "epoch": 11.735030645921736, "grad_norm": 0.39230743050575256, "learning_rate": 2.1803653904861978e-05, "loss": 0.0477, "num_input_tokens_seen": 25044832, "step": 24890 }, { "epoch": 11.737388024516738, "grad_norm": 0.15051087737083435, "learning_rate": 2.1793452588176747e-05, "loss": 0.1747, "num_input_tokens_seen": 25049696, "step": 24895 }, { "epoch": 11.73974540311174, "grad_norm": 0.7101361155509949, "learning_rate": 2.178325181430554e-05, "loss": 0.1231, "num_input_tokens_seen": 25054848, "step": 24900 }, { "epoch": 11.742102781706743, "grad_norm": 0.631935715675354, "learning_rate": 2.1773051584975186e-05, "loss": 0.1372, "num_input_tokens_seen": 25059104, "step": 24905 }, { "epoch": 11.744460160301745, "grad_norm": 0.4786868989467621, "learning_rate": 2.1762851901912406e-05, "loss": 0.2243, "num_input_tokens_seen": 25064096, "step": 24910 }, { "epoch": 11.746817538896746, "grad_norm": 0.03839823231101036, "learning_rate": 2.175265276684384e-05, "loss": 0.0521, "num_input_tokens_seen": 25069216, "step": 24915 }, { "epoch": 11.749174917491748, "grad_norm": 1.5974210500717163, "learning_rate": 2.174245418149602e-05, "loss": 0.2958, "num_input_tokens_seen": 25073952, "step": 24920 }, { "epoch": 11.75153229608675, "grad_norm": 0.36947616934776306, "learning_rate": 2.17322561475954e-05, "loss": 0.0465, "num_input_tokens_seen": 25079040, "step": 24925 }, { "epoch": 11.753889674681753, "grad_norm": 0.530179500579834, "learning_rate": 2.1722058666868326e-05, "loss": 0.1111, "num_input_tokens_seen": 25084320, "step": 24930 }, { "epoch": 11.756247053276756, "grad_norm": 0.44307419657707214, "learning_rate": 2.171186174104108e-05, "loss": 0.0404, "num_input_tokens_seen": 25089760, "step": 24935 }, { "epoch": 11.758604431871758, "grad_norm": 0.7532908916473389, "learning_rate": 2.1701665371839797e-05, "loss": 0.0562, "num_input_tokens_seen": 25095840, "step": 24940 }, { "epoch": 11.76096181046676, "grad_norm": 0.10672253370285034, "learning_rate": 2.169146956099057e-05, "loss": 0.0898, "num_input_tokens_seen": 25100640, "step": 24945 }, { "epoch": 11.763319189061763, "grad_norm": 1.953614592552185, "learning_rate": 2.1681274310219367e-05, "loss": 0.3545, "num_input_tokens_seen": 25106016, "step": 24950 }, { "epoch": 11.765676567656765, "grad_norm": 0.18144281208515167, "learning_rate": 2.1671079621252072e-05, "loss": 0.0178, "num_input_tokens_seen": 25110912, "step": 24955 }, { "epoch": 11.768033946251768, "grad_norm": 1.2742722034454346, "learning_rate": 2.1660885495814476e-05, "loss": 0.0861, "num_input_tokens_seen": 25115200, "step": 24960 }, { "epoch": 11.77039132484677, "grad_norm": 0.16025106608867645, "learning_rate": 2.1650691935632276e-05, "loss": 0.1555, "num_input_tokens_seen": 25119072, "step": 24965 }, { "epoch": 11.772748703441772, "grad_norm": 0.6609090566635132, "learning_rate": 2.1640498942431058e-05, "loss": 0.1953, "num_input_tokens_seen": 25124320, "step": 24970 }, { "epoch": 11.775106082036775, "grad_norm": 0.861893355846405, "learning_rate": 2.163030651793633e-05, "loss": 0.142, "num_input_tokens_seen": 25129056, "step": 24975 }, { "epoch": 11.777463460631777, "grad_norm": 0.45203903317451477, "learning_rate": 2.1620114663873492e-05, "loss": 0.1761, "num_input_tokens_seen": 25133952, "step": 24980 }, { "epoch": 11.77982083922678, "grad_norm": 0.40186771750450134, "learning_rate": 2.160992338196786e-05, "loss": 0.1508, "num_input_tokens_seen": 25138976, "step": 24985 }, { "epoch": 11.782178217821782, "grad_norm": 0.7531415820121765, "learning_rate": 2.1599732673944634e-05, "loss": 0.152, "num_input_tokens_seen": 25143488, "step": 24990 }, { "epoch": 11.784535596416784, "grad_norm": 1.6144723892211914, "learning_rate": 2.158954254152893e-05, "loss": 0.204, "num_input_tokens_seen": 25148224, "step": 24995 }, { "epoch": 11.786892975011787, "grad_norm": 1.4677971601486206, "learning_rate": 2.1579352986445772e-05, "loss": 0.1061, "num_input_tokens_seen": 25154144, "step": 25000 }, { "epoch": 11.78925035360679, "grad_norm": 0.13105568289756775, "learning_rate": 2.156916401042008e-05, "loss": 0.0281, "num_input_tokens_seen": 25158656, "step": 25005 }, { "epoch": 11.791607732201792, "grad_norm": 1.2316029071807861, "learning_rate": 2.1558975615176658e-05, "loss": 0.1115, "num_input_tokens_seen": 25163488, "step": 25010 }, { "epoch": 11.793965110796794, "grad_norm": 0.1551487147808075, "learning_rate": 2.1548787802440244e-05, "loss": 0.0788, "num_input_tokens_seen": 25168256, "step": 25015 }, { "epoch": 11.796322489391796, "grad_norm": 0.03743300959467888, "learning_rate": 2.153860057393545e-05, "loss": 0.0709, "num_input_tokens_seen": 25172672, "step": 25020 }, { "epoch": 11.798679867986799, "grad_norm": 0.5192599296569824, "learning_rate": 2.152841393138682e-05, "loss": 0.3341, "num_input_tokens_seen": 25177760, "step": 25025 }, { "epoch": 11.801037246581801, "grad_norm": 0.2614092230796814, "learning_rate": 2.151822787651876e-05, "loss": 0.1762, "num_input_tokens_seen": 25182752, "step": 25030 }, { "epoch": 11.803394625176804, "grad_norm": 0.03545684367418289, "learning_rate": 2.1508042411055606e-05, "loss": 0.3467, "num_input_tokens_seen": 25187168, "step": 25035 }, { "epoch": 11.805752003771806, "grad_norm": 0.2037430703639984, "learning_rate": 2.149785753672158e-05, "loss": 0.1385, "num_input_tokens_seen": 25191552, "step": 25040 }, { "epoch": 11.808109382366808, "grad_norm": 0.15114876627922058, "learning_rate": 2.1487673255240816e-05, "loss": 0.1239, "num_input_tokens_seen": 25196256, "step": 25045 }, { "epoch": 11.81046676096181, "grad_norm": 1.184826374053955, "learning_rate": 2.1477489568337324e-05, "loss": 0.0316, "num_input_tokens_seen": 25201632, "step": 25050 }, { "epoch": 11.812824139556813, "grad_norm": 1.6389726400375366, "learning_rate": 2.1467306477735042e-05, "loss": 0.0649, "num_input_tokens_seen": 25206656, "step": 25055 }, { "epoch": 11.815181518151816, "grad_norm": 0.21327638626098633, "learning_rate": 2.145712398515779e-05, "loss": 0.1206, "num_input_tokens_seen": 25211104, "step": 25060 }, { "epoch": 11.817538896746818, "grad_norm": 2.783848524093628, "learning_rate": 2.1446942092329303e-05, "loss": 0.2116, "num_input_tokens_seen": 25215936, "step": 25065 }, { "epoch": 11.81989627534182, "grad_norm": 1.2695064544677734, "learning_rate": 2.143676080097318e-05, "loss": 0.2557, "num_input_tokens_seen": 25221984, "step": 25070 }, { "epoch": 11.822253653936823, "grad_norm": 1.8397151231765747, "learning_rate": 2.1426580112812962e-05, "loss": 0.0727, "num_input_tokens_seen": 25226720, "step": 25075 }, { "epoch": 11.824611032531825, "grad_norm": 0.1390901505947113, "learning_rate": 2.1416400029572052e-05, "loss": 0.0186, "num_input_tokens_seen": 25232672, "step": 25080 }, { "epoch": 11.826968411126828, "grad_norm": 1.3614243268966675, "learning_rate": 2.1406220552973777e-05, "loss": 0.2457, "num_input_tokens_seen": 25237856, "step": 25085 }, { "epoch": 11.82932578972183, "grad_norm": 0.4527030289173126, "learning_rate": 2.1396041684741335e-05, "loss": 0.1957, "num_input_tokens_seen": 25242208, "step": 25090 }, { "epoch": 11.831683168316832, "grad_norm": 1.3263221979141235, "learning_rate": 2.1385863426597847e-05, "loss": 0.1623, "num_input_tokens_seen": 25247296, "step": 25095 }, { "epoch": 11.834040546911835, "grad_norm": 0.7025123834609985, "learning_rate": 2.137568578026632e-05, "loss": 0.0241, "num_input_tokens_seen": 25252480, "step": 25100 }, { "epoch": 11.836397925506837, "grad_norm": 1.1336921453475952, "learning_rate": 2.1365508747469655e-05, "loss": 0.3384, "num_input_tokens_seen": 25257536, "step": 25105 }, { "epoch": 11.838755304101838, "grad_norm": 0.6025177836418152, "learning_rate": 2.1355332329930642e-05, "loss": 0.1106, "num_input_tokens_seen": 25262720, "step": 25110 }, { "epoch": 11.841112682696842, "grad_norm": 0.381564736366272, "learning_rate": 2.1345156529371983e-05, "loss": 0.0746, "num_input_tokens_seen": 25268000, "step": 25115 }, { "epoch": 11.843470061291843, "grad_norm": 0.16275882720947266, "learning_rate": 2.133498134751627e-05, "loss": 0.1218, "num_input_tokens_seen": 25272800, "step": 25120 }, { "epoch": 11.845827439886845, "grad_norm": 0.11172161251306534, "learning_rate": 2.1324806786085992e-05, "loss": 0.0207, "num_input_tokens_seen": 25278240, "step": 25125 }, { "epoch": 11.848184818481847, "grad_norm": 1.4782929420471191, "learning_rate": 2.1314632846803515e-05, "loss": 0.1504, "num_input_tokens_seen": 25282880, "step": 25130 }, { "epoch": 11.85054219707685, "grad_norm": 1.2631540298461914, "learning_rate": 2.1304459531391124e-05, "loss": 0.0644, "num_input_tokens_seen": 25288128, "step": 25135 }, { "epoch": 11.852899575671852, "grad_norm": 1.2308701276779175, "learning_rate": 2.129428684157099e-05, "loss": 0.1099, "num_input_tokens_seen": 25293536, "step": 25140 }, { "epoch": 11.855256954266855, "grad_norm": 0.5493714213371277, "learning_rate": 2.1284114779065177e-05, "loss": 0.1722, "num_input_tokens_seen": 25297984, "step": 25145 }, { "epoch": 11.857614332861857, "grad_norm": 1.6563944816589355, "learning_rate": 2.1273943345595637e-05, "loss": 0.2045, "num_input_tokens_seen": 25302656, "step": 25150 }, { "epoch": 11.85997171145686, "grad_norm": 0.3126141428947449, "learning_rate": 2.1263772542884224e-05, "loss": 0.1052, "num_input_tokens_seen": 25307072, "step": 25155 }, { "epoch": 11.862329090051862, "grad_norm": 0.5844438076019287, "learning_rate": 2.1253602372652692e-05, "loss": 0.0859, "num_input_tokens_seen": 25311616, "step": 25160 }, { "epoch": 11.864686468646864, "grad_norm": 0.30135005712509155, "learning_rate": 2.1243432836622656e-05, "loss": 0.0364, "num_input_tokens_seen": 25316480, "step": 25165 }, { "epoch": 11.867043847241867, "grad_norm": 0.5666851997375488, "learning_rate": 2.1233263936515657e-05, "loss": 0.0268, "num_input_tokens_seen": 25322304, "step": 25170 }, { "epoch": 11.869401225836869, "grad_norm": 0.15140628814697266, "learning_rate": 2.1223095674053125e-05, "loss": 0.1254, "num_input_tokens_seen": 25326944, "step": 25175 }, { "epoch": 11.871758604431871, "grad_norm": 0.17011107504367828, "learning_rate": 2.1212928050956362e-05, "loss": 0.0677, "num_input_tokens_seen": 25332128, "step": 25180 }, { "epoch": 11.874115983026874, "grad_norm": 1.3852314949035645, "learning_rate": 2.120276106894658e-05, "loss": 0.2183, "num_input_tokens_seen": 25336768, "step": 25185 }, { "epoch": 11.876473361621876, "grad_norm": 0.1501593440771103, "learning_rate": 2.1192594729744876e-05, "loss": 0.1129, "num_input_tokens_seen": 25341728, "step": 25190 }, { "epoch": 11.878830740216879, "grad_norm": 0.8736999034881592, "learning_rate": 2.118242903507224e-05, "loss": 0.2055, "num_input_tokens_seen": 25345888, "step": 25195 }, { "epoch": 11.881188118811881, "grad_norm": 0.12865017354488373, "learning_rate": 2.117226398664955e-05, "loss": 0.0549, "num_input_tokens_seen": 25350976, "step": 25200 }, { "epoch": 11.883545497406883, "grad_norm": 0.31540387868881226, "learning_rate": 2.116209958619757e-05, "loss": 0.0319, "num_input_tokens_seen": 25355584, "step": 25205 }, { "epoch": 11.885902876001886, "grad_norm": 0.5070163607597351, "learning_rate": 2.1151935835436967e-05, "loss": 0.0743, "num_input_tokens_seen": 25360544, "step": 25210 }, { "epoch": 11.888260254596888, "grad_norm": 0.3483172357082367, "learning_rate": 2.11417727360883e-05, "loss": 0.086, "num_input_tokens_seen": 25365280, "step": 25215 }, { "epoch": 11.89061763319189, "grad_norm": 2.647056818008423, "learning_rate": 2.1131610289871988e-05, "loss": 0.0778, "num_input_tokens_seen": 25369952, "step": 25220 }, { "epoch": 11.892975011786893, "grad_norm": 0.1542648822069168, "learning_rate": 2.112144849850838e-05, "loss": 0.1867, "num_input_tokens_seen": 25375136, "step": 25225 }, { "epoch": 11.895332390381895, "grad_norm": 1.3321589231491089, "learning_rate": 2.111128736371768e-05, "loss": 0.1361, "num_input_tokens_seen": 25381152, "step": 25230 }, { "epoch": 11.897689768976898, "grad_norm": 0.02462528645992279, "learning_rate": 2.1101126887220014e-05, "loss": 0.0989, "num_input_tokens_seen": 25387008, "step": 25235 }, { "epoch": 11.9000471475719, "grad_norm": 0.014160643331706524, "learning_rate": 2.1090967070735357e-05, "loss": 0.0634, "num_input_tokens_seen": 25391616, "step": 25240 }, { "epoch": 11.902404526166903, "grad_norm": 0.11822683364152908, "learning_rate": 2.1080807915983607e-05, "loss": 0.0251, "num_input_tokens_seen": 25397760, "step": 25245 }, { "epoch": 11.904761904761905, "grad_norm": 0.08636298775672913, "learning_rate": 2.107064942468453e-05, "loss": 0.1086, "num_input_tokens_seen": 25403392, "step": 25250 }, { "epoch": 11.907119283356908, "grad_norm": 0.8092935681343079, "learning_rate": 2.1060491598557795e-05, "loss": 0.0454, "num_input_tokens_seen": 25408032, "step": 25255 }, { "epoch": 11.90947666195191, "grad_norm": 0.9659920334815979, "learning_rate": 2.105033443932294e-05, "loss": 0.0698, "num_input_tokens_seen": 25412640, "step": 25260 }, { "epoch": 11.911834040546912, "grad_norm": 0.6973311901092529, "learning_rate": 2.10401779486994e-05, "loss": 0.0789, "num_input_tokens_seen": 25418336, "step": 25265 }, { "epoch": 11.914191419141915, "grad_norm": 0.5242640972137451, "learning_rate": 2.1030022128406502e-05, "loss": 0.0577, "num_input_tokens_seen": 25422688, "step": 25270 }, { "epoch": 11.916548797736917, "grad_norm": 1.2724729776382446, "learning_rate": 2.1019866980163456e-05, "loss": 0.2053, "num_input_tokens_seen": 25426784, "step": 25275 }, { "epoch": 11.91890617633192, "grad_norm": 0.9166381359100342, "learning_rate": 2.1009712505689345e-05, "loss": 0.039, "num_input_tokens_seen": 25432448, "step": 25280 }, { "epoch": 11.921263554926922, "grad_norm": 0.0676610916852951, "learning_rate": 2.0999558706703156e-05, "loss": 0.0314, "num_input_tokens_seen": 25437568, "step": 25285 }, { "epoch": 11.923620933521924, "grad_norm": 0.4701225459575653, "learning_rate": 2.098940558492375e-05, "loss": 0.1012, "num_input_tokens_seen": 25442016, "step": 25290 }, { "epoch": 11.925978312116927, "grad_norm": 1.456315040588379, "learning_rate": 2.0979253142069894e-05, "loss": 0.2196, "num_input_tokens_seen": 25446080, "step": 25295 }, { "epoch": 11.92833569071193, "grad_norm": 1.4469963312149048, "learning_rate": 2.0969101379860207e-05, "loss": 0.1493, "num_input_tokens_seen": 25450592, "step": 25300 }, { "epoch": 11.930693069306932, "grad_norm": 0.17876754701137543, "learning_rate": 2.0958950300013212e-05, "loss": 0.0315, "num_input_tokens_seen": 25455392, "step": 25305 }, { "epoch": 11.933050447901934, "grad_norm": 0.39956608414649963, "learning_rate": 2.094879990424732e-05, "loss": 0.2602, "num_input_tokens_seen": 25461376, "step": 25310 }, { "epoch": 11.935407826496935, "grad_norm": 0.7442681193351746, "learning_rate": 2.0938650194280823e-05, "loss": 0.1017, "num_input_tokens_seen": 25465920, "step": 25315 }, { "epoch": 11.937765205091937, "grad_norm": 1.3613029718399048, "learning_rate": 2.092850117183189e-05, "loss": 0.1531, "num_input_tokens_seen": 25471168, "step": 25320 }, { "epoch": 11.94012258368694, "grad_norm": 0.5050333142280579, "learning_rate": 2.091835283861857e-05, "loss": 0.1021, "num_input_tokens_seen": 25478464, "step": 25325 }, { "epoch": 11.942479962281942, "grad_norm": 0.4065198302268982, "learning_rate": 2.090820519635882e-05, "loss": 0.0412, "num_input_tokens_seen": 25483904, "step": 25330 }, { "epoch": 11.944837340876944, "grad_norm": 0.10835705697536469, "learning_rate": 2.0898058246770456e-05, "loss": 0.097, "num_input_tokens_seen": 25488608, "step": 25335 }, { "epoch": 11.947194719471947, "grad_norm": 1.7478396892547607, "learning_rate": 2.088791199157118e-05, "loss": 0.299, "num_input_tokens_seen": 25492960, "step": 25340 }, { "epoch": 11.949552098066949, "grad_norm": 0.17672379314899445, "learning_rate": 2.0877766432478586e-05, "loss": 0.0298, "num_input_tokens_seen": 25497152, "step": 25345 }, { "epoch": 11.951909476661951, "grad_norm": 0.9711182713508606, "learning_rate": 2.086762157121014e-05, "loss": 0.071, "num_input_tokens_seen": 25501824, "step": 25350 }, { "epoch": 11.954266855256954, "grad_norm": 0.5459961891174316, "learning_rate": 2.0857477409483206e-05, "loss": 0.0277, "num_input_tokens_seen": 25508064, "step": 25355 }, { "epoch": 11.956624233851956, "grad_norm": 0.06119091063737869, "learning_rate": 2.0847333949015006e-05, "loss": 0.1124, "num_input_tokens_seen": 25512352, "step": 25360 }, { "epoch": 11.958981612446959, "grad_norm": 2.376254081726074, "learning_rate": 2.0837191191522657e-05, "loss": 0.303, "num_input_tokens_seen": 25518560, "step": 25365 }, { "epoch": 11.961338991041961, "grad_norm": 0.03446148708462715, "learning_rate": 2.082704913872316e-05, "loss": 0.0219, "num_input_tokens_seen": 25522880, "step": 25370 }, { "epoch": 11.963696369636963, "grad_norm": 0.14182119071483612, "learning_rate": 2.08169077923334e-05, "loss": 0.0748, "num_input_tokens_seen": 25528032, "step": 25375 }, { "epoch": 11.966053748231966, "grad_norm": 1.283300518989563, "learning_rate": 2.0806767154070118e-05, "loss": 0.0936, "num_input_tokens_seen": 25533792, "step": 25380 }, { "epoch": 11.968411126826968, "grad_norm": 0.4130665063858032, "learning_rate": 2.0796627225649967e-05, "loss": 0.1522, "num_input_tokens_seen": 25538784, "step": 25385 }, { "epoch": 11.97076850542197, "grad_norm": 0.4801539480686188, "learning_rate": 2.078648800878945e-05, "loss": 0.1773, "num_input_tokens_seen": 25545024, "step": 25390 }, { "epoch": 11.973125884016973, "grad_norm": 1.395780324935913, "learning_rate": 2.0776349505204972e-05, "loss": 0.0319, "num_input_tokens_seen": 25548800, "step": 25395 }, { "epoch": 11.975483262611975, "grad_norm": 0.19405803084373474, "learning_rate": 2.0766211716612816e-05, "loss": 0.09, "num_input_tokens_seen": 25553056, "step": 25400 }, { "epoch": 11.977840641206978, "grad_norm": 0.6979353427886963, "learning_rate": 2.0756074644729124e-05, "loss": 0.2047, "num_input_tokens_seen": 25558112, "step": 25405 }, { "epoch": 11.98019801980198, "grad_norm": 0.9130294919013977, "learning_rate": 2.0745938291269932e-05, "loss": 0.281, "num_input_tokens_seen": 25562688, "step": 25410 }, { "epoch": 11.982555398396983, "grad_norm": 0.2627260982990265, "learning_rate": 2.0735802657951166e-05, "loss": 0.1104, "num_input_tokens_seen": 25567936, "step": 25415 }, { "epoch": 11.984912776991985, "grad_norm": 0.056343402713537216, "learning_rate": 2.072566774648861e-05, "loss": 0.0132, "num_input_tokens_seen": 25572320, "step": 25420 }, { "epoch": 11.987270155586987, "grad_norm": 1.1558871269226074, "learning_rate": 2.0715533558597922e-05, "loss": 0.0641, "num_input_tokens_seen": 25576800, "step": 25425 }, { "epoch": 11.98962753418199, "grad_norm": 0.44119343161582947, "learning_rate": 2.070540009599466e-05, "loss": 0.0562, "num_input_tokens_seen": 25581408, "step": 25430 }, { "epoch": 11.991984912776992, "grad_norm": 2.566732883453369, "learning_rate": 2.0695267360394245e-05, "loss": 0.0961, "num_input_tokens_seen": 25585728, "step": 25435 }, { "epoch": 11.994342291371995, "grad_norm": 2.214256763458252, "learning_rate": 2.0685135353511974e-05, "loss": 0.0538, "num_input_tokens_seen": 25591424, "step": 25440 }, { "epoch": 11.996699669966997, "grad_norm": 0.1360449194908142, "learning_rate": 2.0675004077063026e-05, "loss": 0.1025, "num_input_tokens_seen": 25596128, "step": 25445 }, { "epoch": 11.999057048562, "grad_norm": 0.08938706666231155, "learning_rate": 2.0664873532762452e-05, "loss": 0.0925, "num_input_tokens_seen": 25600288, "step": 25450 }, { "epoch": 12.0, "eval_loss": 0.1521083414554596, "eval_runtime": 15.0863, "eval_samples_per_second": 62.507, "eval_steps_per_second": 15.643, "num_input_tokens_seen": 25602144, "step": 25452 }, { "epoch": 12.001414427157002, "grad_norm": 0.1098584309220314, "learning_rate": 2.0654743722325186e-05, "loss": 0.1475, "num_input_tokens_seen": 25605408, "step": 25455 }, { "epoch": 12.003771805752004, "grad_norm": 1.2340846061706543, "learning_rate": 2.064461464746603e-05, "loss": 0.1234, "num_input_tokens_seen": 25609760, "step": 25460 }, { "epoch": 12.006129184347007, "grad_norm": 0.57427978515625, "learning_rate": 2.0634486309899657e-05, "loss": 0.0854, "num_input_tokens_seen": 25615296, "step": 25465 }, { "epoch": 12.008486562942009, "grad_norm": 0.44399842619895935, "learning_rate": 2.0624358711340634e-05, "loss": 0.1065, "num_input_tokens_seen": 25619840, "step": 25470 }, { "epoch": 12.010843941537011, "grad_norm": 0.2655787765979767, "learning_rate": 2.061423185350339e-05, "loss": 0.0909, "num_input_tokens_seen": 25624416, "step": 25475 }, { "epoch": 12.013201320132014, "grad_norm": 0.15290924906730652, "learning_rate": 2.0604105738102225e-05, "loss": 0.0402, "num_input_tokens_seen": 25629312, "step": 25480 }, { "epoch": 12.015558698727016, "grad_norm": 1.4139834642410278, "learning_rate": 2.0593980366851316e-05, "loss": 0.2107, "num_input_tokens_seen": 25635520, "step": 25485 }, { "epoch": 12.017916077322019, "grad_norm": 0.5131872296333313, "learning_rate": 2.0583855741464725e-05, "loss": 0.3121, "num_input_tokens_seen": 25639840, "step": 25490 }, { "epoch": 12.020273455917021, "grad_norm": 0.014422750100493431, "learning_rate": 2.0573731863656376e-05, "loss": 0.0573, "num_input_tokens_seen": 25644224, "step": 25495 }, { "epoch": 12.022630834512023, "grad_norm": 1.275730013847351, "learning_rate": 2.056360873514007e-05, "loss": 0.135, "num_input_tokens_seen": 25649440, "step": 25500 }, { "epoch": 12.024988213107026, "grad_norm": 0.48457539081573486, "learning_rate": 2.0553486357629474e-05, "loss": 0.0774, "num_input_tokens_seen": 25653952, "step": 25505 }, { "epoch": 12.027345591702028, "grad_norm": 1.2072455883026123, "learning_rate": 2.0543364732838143e-05, "loss": 0.0491, "num_input_tokens_seen": 25658240, "step": 25510 }, { "epoch": 12.029702970297029, "grad_norm": 0.09556149691343307, "learning_rate": 2.0533243862479494e-05, "loss": 0.1467, "num_input_tokens_seen": 25662784, "step": 25515 }, { "epoch": 12.032060348892031, "grad_norm": 0.2231968492269516, "learning_rate": 2.052312374826682e-05, "loss": 0.1262, "num_input_tokens_seen": 25668992, "step": 25520 }, { "epoch": 12.034417727487034, "grad_norm": 0.611928403377533, "learning_rate": 2.051300439191328e-05, "loss": 0.1313, "num_input_tokens_seen": 25673344, "step": 25525 }, { "epoch": 12.036775106082036, "grad_norm": 1.212814211845398, "learning_rate": 2.050288579513191e-05, "loss": 0.1028, "num_input_tokens_seen": 25679104, "step": 25530 }, { "epoch": 12.039132484677038, "grad_norm": 1.0965484380722046, "learning_rate": 2.0492767959635618e-05, "loss": 0.0842, "num_input_tokens_seen": 25683136, "step": 25535 }, { "epoch": 12.04148986327204, "grad_norm": 0.20326027274131775, "learning_rate": 2.048265088713719e-05, "loss": 0.1378, "num_input_tokens_seen": 25689088, "step": 25540 }, { "epoch": 12.043847241867043, "grad_norm": 0.7672703266143799, "learning_rate": 2.0472534579349256e-05, "loss": 0.1063, "num_input_tokens_seen": 25693952, "step": 25545 }, { "epoch": 12.046204620462046, "grad_norm": 0.7328671216964722, "learning_rate": 2.046241903798435e-05, "loss": 0.0585, "num_input_tokens_seen": 25697984, "step": 25550 }, { "epoch": 12.048561999057048, "grad_norm": 0.15652580559253693, "learning_rate": 2.0452304264754854e-05, "loss": 0.3135, "num_input_tokens_seen": 25702144, "step": 25555 }, { "epoch": 12.05091937765205, "grad_norm": 0.5584300756454468, "learning_rate": 2.044219026137304e-05, "loss": 0.0964, "num_input_tokens_seen": 25707968, "step": 25560 }, { "epoch": 12.053276756247053, "grad_norm": 1.1199671030044556, "learning_rate": 2.043207702955102e-05, "loss": 0.1715, "num_input_tokens_seen": 25712736, "step": 25565 }, { "epoch": 12.055634134842055, "grad_norm": 0.3970234990119934, "learning_rate": 2.0421964571000797e-05, "loss": 0.1213, "num_input_tokens_seen": 25718176, "step": 25570 }, { "epoch": 12.057991513437058, "grad_norm": 0.36489129066467285, "learning_rate": 2.0411852887434246e-05, "loss": 0.2551, "num_input_tokens_seen": 25722400, "step": 25575 }, { "epoch": 12.06034889203206, "grad_norm": 1.3381140232086182, "learning_rate": 2.04017419805631e-05, "loss": 0.2008, "num_input_tokens_seen": 25727200, "step": 25580 }, { "epoch": 12.062706270627062, "grad_norm": 1.1764881610870361, "learning_rate": 2.0391631852098964e-05, "loss": 0.1148, "num_input_tokens_seen": 25732896, "step": 25585 }, { "epoch": 12.065063649222065, "grad_norm": 0.5429256558418274, "learning_rate": 2.0381522503753304e-05, "loss": 0.1021, "num_input_tokens_seen": 25739776, "step": 25590 }, { "epoch": 12.067421027817067, "grad_norm": 1.8312867879867554, "learning_rate": 2.037141393723747e-05, "loss": 0.1742, "num_input_tokens_seen": 25743840, "step": 25595 }, { "epoch": 12.06977840641207, "grad_norm": 0.9025647044181824, "learning_rate": 2.0361306154262668e-05, "loss": 0.2091, "num_input_tokens_seen": 25749440, "step": 25600 }, { "epoch": 12.072135785007072, "grad_norm": 0.11282454431056976, "learning_rate": 2.0351199156539972e-05, "loss": 0.0861, "num_input_tokens_seen": 25754016, "step": 25605 }, { "epoch": 12.074493163602074, "grad_norm": 0.23800644278526306, "learning_rate": 2.034109294578034e-05, "loss": 0.0985, "num_input_tokens_seen": 25758368, "step": 25610 }, { "epoch": 12.076850542197077, "grad_norm": 0.5226706862449646, "learning_rate": 2.0330987523694552e-05, "loss": 0.0503, "num_input_tokens_seen": 25763264, "step": 25615 }, { "epoch": 12.07920792079208, "grad_norm": 0.13404987752437592, "learning_rate": 2.0320882891993305e-05, "loss": 0.023, "num_input_tokens_seen": 25768576, "step": 25620 }, { "epoch": 12.081565299387082, "grad_norm": 0.2667337954044342, "learning_rate": 2.0310779052387136e-05, "loss": 0.0552, "num_input_tokens_seen": 25773440, "step": 25625 }, { "epoch": 12.083922677982084, "grad_norm": 0.08210758864879608, "learning_rate": 2.0300676006586462e-05, "loss": 0.0725, "num_input_tokens_seen": 25778752, "step": 25630 }, { "epoch": 12.086280056577086, "grad_norm": 0.09653712064027786, "learning_rate": 2.0290573756301544e-05, "loss": 0.1294, "num_input_tokens_seen": 25783936, "step": 25635 }, { "epoch": 12.088637435172089, "grad_norm": 0.5245264768600464, "learning_rate": 2.0280472303242525e-05, "loss": 0.1424, "num_input_tokens_seen": 25787968, "step": 25640 }, { "epoch": 12.090994813767091, "grad_norm": 0.04544582962989807, "learning_rate": 2.0270371649119412e-05, "loss": 0.0536, "num_input_tokens_seen": 25792992, "step": 25645 }, { "epoch": 12.093352192362094, "grad_norm": 0.8130510449409485, "learning_rate": 2.0260271795642083e-05, "loss": 0.0446, "num_input_tokens_seen": 25798144, "step": 25650 }, { "epoch": 12.095709570957096, "grad_norm": 0.7784454226493835, "learning_rate": 2.0250172744520258e-05, "loss": 0.1022, "num_input_tokens_seen": 25803488, "step": 25655 }, { "epoch": 12.098066949552098, "grad_norm": 0.0187683068215847, "learning_rate": 2.0240074497463536e-05, "loss": 0.0781, "num_input_tokens_seen": 25808384, "step": 25660 }, { "epoch": 12.100424328147101, "grad_norm": 0.34002986550331116, "learning_rate": 2.0229977056181383e-05, "loss": 0.0262, "num_input_tokens_seen": 25814816, "step": 25665 }, { "epoch": 12.102781706742103, "grad_norm": 0.3644821345806122, "learning_rate": 2.0219880422383135e-05, "loss": 0.1124, "num_input_tokens_seen": 25820128, "step": 25670 }, { "epoch": 12.105139085337106, "grad_norm": 0.8028044700622559, "learning_rate": 2.0209784597777958e-05, "loss": 0.3187, "num_input_tokens_seen": 25824864, "step": 25675 }, { "epoch": 12.107496463932108, "grad_norm": 0.39154261350631714, "learning_rate": 2.019968958407492e-05, "loss": 0.1648, "num_input_tokens_seen": 25830848, "step": 25680 }, { "epoch": 12.10985384252711, "grad_norm": 1.8387624025344849, "learning_rate": 2.0189595382982925e-05, "loss": 0.1634, "num_input_tokens_seen": 25835648, "step": 25685 }, { "epoch": 12.112211221122113, "grad_norm": 0.14927901327610016, "learning_rate": 2.0179501996210765e-05, "loss": 0.0811, "num_input_tokens_seen": 25839968, "step": 25690 }, { "epoch": 12.114568599717115, "grad_norm": 0.371696799993515, "learning_rate": 2.0169409425467063e-05, "loss": 0.0592, "num_input_tokens_seen": 25845344, "step": 25695 }, { "epoch": 12.116925978312118, "grad_norm": 0.21092528104782104, "learning_rate": 2.015931767246033e-05, "loss": 0.0581, "num_input_tokens_seen": 25849536, "step": 25700 }, { "epoch": 12.11928335690712, "grad_norm": 0.26333126425743103, "learning_rate": 2.014922673889892e-05, "loss": 0.1491, "num_input_tokens_seen": 25854688, "step": 25705 }, { "epoch": 12.121640735502123, "grad_norm": 1.38273024559021, "learning_rate": 2.013913662649107e-05, "loss": 0.0768, "num_input_tokens_seen": 25861088, "step": 25710 }, { "epoch": 12.123998114097123, "grad_norm": 0.09570011496543884, "learning_rate": 2.012904733694485e-05, "loss": 0.0484, "num_input_tokens_seen": 25866816, "step": 25715 }, { "epoch": 12.126355492692126, "grad_norm": 1.1231496334075928, "learning_rate": 2.011895887196821e-05, "loss": 0.1246, "num_input_tokens_seen": 25872032, "step": 25720 }, { "epoch": 12.128712871287128, "grad_norm": 0.16080130636692047, "learning_rate": 2.0108871233268964e-05, "loss": 0.155, "num_input_tokens_seen": 25876896, "step": 25725 }, { "epoch": 12.13107024988213, "grad_norm": 1.472704529762268, "learning_rate": 2.0098784422554774e-05, "loss": 0.0631, "num_input_tokens_seen": 25882880, "step": 25730 }, { "epoch": 12.133427628477133, "grad_norm": 0.9042519330978394, "learning_rate": 2.0088698441533153e-05, "loss": 0.1147, "num_input_tokens_seen": 25888544, "step": 25735 }, { "epoch": 12.135785007072135, "grad_norm": 0.06405794620513916, "learning_rate": 2.00786132919115e-05, "loss": 0.0084, "num_input_tokens_seen": 25893376, "step": 25740 }, { "epoch": 12.138142385667138, "grad_norm": 0.6981691122055054, "learning_rate": 2.006852897539706e-05, "loss": 0.0735, "num_input_tokens_seen": 25898016, "step": 25745 }, { "epoch": 12.14049976426214, "grad_norm": 0.04976996034383774, "learning_rate": 2.0058445493696935e-05, "loss": 0.1667, "num_input_tokens_seen": 25902784, "step": 25750 }, { "epoch": 12.142857142857142, "grad_norm": 0.07147976756095886, "learning_rate": 2.0048362848518076e-05, "loss": 0.0946, "num_input_tokens_seen": 25907840, "step": 25755 }, { "epoch": 12.145214521452145, "grad_norm": 0.10291348397731781, "learning_rate": 2.003828104156732e-05, "loss": 0.2502, "num_input_tokens_seen": 25912544, "step": 25760 }, { "epoch": 12.147571900047147, "grad_norm": 0.12468216568231583, "learning_rate": 2.0028200074551333e-05, "loss": 0.0952, "num_input_tokens_seen": 25917760, "step": 25765 }, { "epoch": 12.14992927864215, "grad_norm": 1.3730950355529785, "learning_rate": 2.0018119949176663e-05, "loss": 0.2085, "num_input_tokens_seen": 25923008, "step": 25770 }, { "epoch": 12.152286657237152, "grad_norm": 0.18596501648426056, "learning_rate": 2.0008040667149694e-05, "loss": 0.1777, "num_input_tokens_seen": 25927936, "step": 25775 }, { "epoch": 12.154644035832154, "grad_norm": 0.4865700304508209, "learning_rate": 1.999796223017668e-05, "loss": 0.101, "num_input_tokens_seen": 25933056, "step": 25780 }, { "epoch": 12.157001414427157, "grad_norm": 1.069319248199463, "learning_rate": 1.9987884639963734e-05, "loss": 0.0661, "num_input_tokens_seen": 25938368, "step": 25785 }, { "epoch": 12.15935879302216, "grad_norm": 3.2475931644439697, "learning_rate": 1.997780789821682e-05, "loss": 0.1263, "num_input_tokens_seen": 25942976, "step": 25790 }, { "epoch": 12.161716171617162, "grad_norm": 0.09846047312021255, "learning_rate": 1.9967732006641754e-05, "loss": 0.0912, "num_input_tokens_seen": 25947520, "step": 25795 }, { "epoch": 12.164073550212164, "grad_norm": 0.10943443328142166, "learning_rate": 1.9957656966944213e-05, "loss": 0.1491, "num_input_tokens_seen": 25953024, "step": 25800 }, { "epoch": 12.166430928807166, "grad_norm": 0.10614820569753647, "learning_rate": 1.9947582780829736e-05, "loss": 0.0247, "num_input_tokens_seen": 25957792, "step": 25805 }, { "epoch": 12.168788307402169, "grad_norm": 0.047745734453201294, "learning_rate": 1.9937509450003714e-05, "loss": 0.0732, "num_input_tokens_seen": 25963584, "step": 25810 }, { "epoch": 12.171145685997171, "grad_norm": 0.15604563057422638, "learning_rate": 1.992743697617138e-05, "loss": 0.0128, "num_input_tokens_seen": 25968224, "step": 25815 }, { "epoch": 12.173503064592174, "grad_norm": 0.50341796875, "learning_rate": 1.9917365361037837e-05, "loss": 0.1839, "num_input_tokens_seen": 25973344, "step": 25820 }, { "epoch": 12.175860443187176, "grad_norm": 0.5651980042457581, "learning_rate": 1.9907294606308042e-05, "loss": 0.102, "num_input_tokens_seen": 25978560, "step": 25825 }, { "epoch": 12.178217821782178, "grad_norm": 2.607980489730835, "learning_rate": 1.9897224713686806e-05, "loss": 0.0609, "num_input_tokens_seen": 25983040, "step": 25830 }, { "epoch": 12.18057520037718, "grad_norm": 0.2339956909418106, "learning_rate": 1.9887155684878785e-05, "loss": 0.0449, "num_input_tokens_seen": 25989440, "step": 25835 }, { "epoch": 12.182932578972183, "grad_norm": 0.08788498491048813, "learning_rate": 1.98770875215885e-05, "loss": 0.0724, "num_input_tokens_seen": 25995936, "step": 25840 }, { "epoch": 12.185289957567186, "grad_norm": 1.398773193359375, "learning_rate": 1.9867020225520306e-05, "loss": 0.1164, "num_input_tokens_seen": 26000352, "step": 25845 }, { "epoch": 12.187647336162188, "grad_norm": 0.45142289996147156, "learning_rate": 1.9856953798378434e-05, "loss": 0.0662, "num_input_tokens_seen": 26007200, "step": 25850 }, { "epoch": 12.19000471475719, "grad_norm": 0.667288601398468, "learning_rate": 1.9846888241866963e-05, "loss": 0.214, "num_input_tokens_seen": 26011584, "step": 25855 }, { "epoch": 12.192362093352193, "grad_norm": 0.1159236878156662, "learning_rate": 1.9836823557689822e-05, "loss": 0.0948, "num_input_tokens_seen": 26016384, "step": 25860 }, { "epoch": 12.194719471947195, "grad_norm": 0.07205571979284286, "learning_rate": 1.9826759747550777e-05, "loss": 0.0449, "num_input_tokens_seen": 26022112, "step": 25865 }, { "epoch": 12.197076850542198, "grad_norm": 0.3227495551109314, "learning_rate": 1.9816696813153478e-05, "loss": 0.1266, "num_input_tokens_seen": 26028704, "step": 25870 }, { "epoch": 12.1994342291372, "grad_norm": 0.13549235463142395, "learning_rate": 1.9806634756201394e-05, "loss": 0.113, "num_input_tokens_seen": 26032736, "step": 25875 }, { "epoch": 12.201791607732202, "grad_norm": 1.4441663026809692, "learning_rate": 1.9796573578397877e-05, "loss": 0.2518, "num_input_tokens_seen": 26037664, "step": 25880 }, { "epoch": 12.204148986327205, "grad_norm": 0.11415909230709076, "learning_rate": 1.9786513281446097e-05, "loss": 0.0828, "num_input_tokens_seen": 26045152, "step": 25885 }, { "epoch": 12.206506364922207, "grad_norm": 1.6490615606307983, "learning_rate": 1.97764538670491e-05, "loss": 0.1524, "num_input_tokens_seen": 26051488, "step": 25890 }, { "epoch": 12.20886374351721, "grad_norm": 0.05062340945005417, "learning_rate": 1.976639533690977e-05, "loss": 0.0494, "num_input_tokens_seen": 26057184, "step": 25895 }, { "epoch": 12.211221122112212, "grad_norm": 0.30918848514556885, "learning_rate": 1.9756337692730856e-05, "loss": 0.0255, "num_input_tokens_seen": 26061440, "step": 25900 }, { "epoch": 12.213578500707214, "grad_norm": 0.3469867706298828, "learning_rate": 1.974628093621493e-05, "loss": 0.0454, "num_input_tokens_seen": 26066304, "step": 25905 }, { "epoch": 12.215935879302217, "grad_norm": 1.1132649183273315, "learning_rate": 1.973622506906444e-05, "loss": 0.0846, "num_input_tokens_seen": 26071648, "step": 25910 }, { "epoch": 12.218293257897217, "grad_norm": 2.1842355728149414, "learning_rate": 1.9726170092981675e-05, "loss": 0.2236, "num_input_tokens_seen": 26076160, "step": 25915 }, { "epoch": 12.22065063649222, "grad_norm": 0.35346555709838867, "learning_rate": 1.971611600966877e-05, "loss": 0.2046, "num_input_tokens_seen": 26080736, "step": 25920 }, { "epoch": 12.223008015087222, "grad_norm": 0.7474702596664429, "learning_rate": 1.9706062820827707e-05, "loss": 0.1554, "num_input_tokens_seen": 26086528, "step": 25925 }, { "epoch": 12.225365393682225, "grad_norm": 0.9708106517791748, "learning_rate": 1.969601052816032e-05, "loss": 0.1471, "num_input_tokens_seen": 26091072, "step": 25930 }, { "epoch": 12.227722772277227, "grad_norm": 0.14402739703655243, "learning_rate": 1.9685959133368294e-05, "loss": 0.0138, "num_input_tokens_seen": 26094816, "step": 25935 }, { "epoch": 12.23008015087223, "grad_norm": 3.890791893005371, "learning_rate": 1.9675908638153167e-05, "loss": 0.1036, "num_input_tokens_seen": 26098912, "step": 25940 }, { "epoch": 12.232437529467232, "grad_norm": 1.03544020652771, "learning_rate": 1.96658590442163e-05, "loss": 0.1777, "num_input_tokens_seen": 26104096, "step": 25945 }, { "epoch": 12.234794908062234, "grad_norm": 0.08235906809568405, "learning_rate": 1.9655810353258933e-05, "loss": 0.0487, "num_input_tokens_seen": 26109344, "step": 25950 }, { "epoch": 12.237152286657237, "grad_norm": 0.3920053541660309, "learning_rate": 1.964576256698213e-05, "loss": 0.0448, "num_input_tokens_seen": 26115136, "step": 25955 }, { "epoch": 12.239509665252239, "grad_norm": 3.1042587757110596, "learning_rate": 1.9635715687086825e-05, "loss": 0.1183, "num_input_tokens_seen": 26120608, "step": 25960 }, { "epoch": 12.241867043847241, "grad_norm": 0.445168673992157, "learning_rate": 1.9625669715273763e-05, "loss": 0.0507, "num_input_tokens_seen": 26124800, "step": 25965 }, { "epoch": 12.244224422442244, "grad_norm": 0.08452821522951126, "learning_rate": 1.9615624653243574e-05, "loss": 0.2242, "num_input_tokens_seen": 26129888, "step": 25970 }, { "epoch": 12.246581801037246, "grad_norm": 0.04558582603931427, "learning_rate": 1.9605580502696703e-05, "loss": 0.0329, "num_input_tokens_seen": 26134688, "step": 25975 }, { "epoch": 12.248939179632249, "grad_norm": 0.15201422572135925, "learning_rate": 1.9595537265333467e-05, "loss": 0.0276, "num_input_tokens_seen": 26138848, "step": 25980 }, { "epoch": 12.251296558227251, "grad_norm": 0.19350266456604004, "learning_rate": 1.9585494942854008e-05, "loss": 0.1236, "num_input_tokens_seen": 26146432, "step": 25985 }, { "epoch": 12.253653936822253, "grad_norm": 0.8631453514099121, "learning_rate": 1.9575453536958323e-05, "loss": 0.0647, "num_input_tokens_seen": 26151200, "step": 25990 }, { "epoch": 12.256011315417256, "grad_norm": 1.5191749334335327, "learning_rate": 1.9565413049346242e-05, "loss": 0.0955, "num_input_tokens_seen": 26156384, "step": 25995 }, { "epoch": 12.258368694012258, "grad_norm": 2.2034409046173096, "learning_rate": 1.955537348171747e-05, "loss": 0.0888, "num_input_tokens_seen": 26160192, "step": 26000 }, { "epoch": 12.26072607260726, "grad_norm": 2.191352367401123, "learning_rate": 1.9545334835771516e-05, "loss": 0.1209, "num_input_tokens_seen": 26164864, "step": 26005 }, { "epoch": 12.263083451202263, "grad_norm": 0.19992344081401825, "learning_rate": 1.9535297113207764e-05, "loss": 0.1046, "num_input_tokens_seen": 26170432, "step": 26010 }, { "epoch": 12.265440829797265, "grad_norm": 0.09213367849588394, "learning_rate": 1.952526031572542e-05, "loss": 0.0125, "num_input_tokens_seen": 26175168, "step": 26015 }, { "epoch": 12.267798208392268, "grad_norm": 0.9841488599777222, "learning_rate": 1.9515224445023553e-05, "loss": 0.0951, "num_input_tokens_seen": 26179168, "step": 26020 }, { "epoch": 12.27015558698727, "grad_norm": 0.4774879515171051, "learning_rate": 1.950518950280106e-05, "loss": 0.1471, "num_input_tokens_seen": 26184800, "step": 26025 }, { "epoch": 12.272512965582273, "grad_norm": 0.27184370160102844, "learning_rate": 1.9495155490756683e-05, "loss": 0.0538, "num_input_tokens_seen": 26190720, "step": 26030 }, { "epoch": 12.274870344177275, "grad_norm": 0.7568536996841431, "learning_rate": 1.9485122410589017e-05, "loss": 0.1673, "num_input_tokens_seen": 26195872, "step": 26035 }, { "epoch": 12.277227722772277, "grad_norm": 0.5816645622253418, "learning_rate": 1.9475090263996495e-05, "loss": 0.2574, "num_input_tokens_seen": 26200800, "step": 26040 }, { "epoch": 12.27958510136728, "grad_norm": 0.38001132011413574, "learning_rate": 1.946505905267738e-05, "loss": 0.0496, "num_input_tokens_seen": 26206400, "step": 26045 }, { "epoch": 12.281942479962282, "grad_norm": 0.05066158249974251, "learning_rate": 1.9455028778329786e-05, "loss": 0.0787, "num_input_tokens_seen": 26210464, "step": 26050 }, { "epoch": 12.284299858557285, "grad_norm": 0.05501088127493858, "learning_rate": 1.9444999442651675e-05, "loss": 0.0326, "num_input_tokens_seen": 26215584, "step": 26055 }, { "epoch": 12.286657237152287, "grad_norm": 0.06141791120171547, "learning_rate": 1.943497104734085e-05, "loss": 0.0725, "num_input_tokens_seen": 26223136, "step": 26060 }, { "epoch": 12.28901461574729, "grad_norm": 0.5703955292701721, "learning_rate": 1.9424943594094925e-05, "loss": 0.0497, "num_input_tokens_seen": 26227936, "step": 26065 }, { "epoch": 12.291371994342292, "grad_norm": 0.22203898429870605, "learning_rate": 1.9414917084611398e-05, "loss": 0.0818, "num_input_tokens_seen": 26233056, "step": 26070 }, { "epoch": 12.293729372937294, "grad_norm": 0.7470673322677612, "learning_rate": 1.9404891520587574e-05, "loss": 0.0738, "num_input_tokens_seen": 26237632, "step": 26075 }, { "epoch": 12.296086751532297, "grad_norm": 0.7021785974502563, "learning_rate": 1.9394866903720614e-05, "loss": 0.2118, "num_input_tokens_seen": 26242976, "step": 26080 }, { "epoch": 12.298444130127299, "grad_norm": 0.26013919711112976, "learning_rate": 1.9384843235707516e-05, "loss": 0.0293, "num_input_tokens_seen": 26248864, "step": 26085 }, { "epoch": 12.300801508722301, "grad_norm": 2.0235493183135986, "learning_rate": 1.937482051824513e-05, "loss": 0.1933, "num_input_tokens_seen": 26254464, "step": 26090 }, { "epoch": 12.303158887317304, "grad_norm": 2.4360201358795166, "learning_rate": 1.9364798753030107e-05, "loss": 0.1624, "num_input_tokens_seen": 26259072, "step": 26095 }, { "epoch": 12.305516265912306, "grad_norm": 1.3592474460601807, "learning_rate": 1.935477794175898e-05, "loss": 0.0619, "num_input_tokens_seen": 26264224, "step": 26100 }, { "epoch": 12.307873644507309, "grad_norm": 0.29824909567832947, "learning_rate": 1.9344758086128096e-05, "loss": 0.1082, "num_input_tokens_seen": 26268896, "step": 26105 }, { "epoch": 12.310231023102311, "grad_norm": 1.4654033184051514, "learning_rate": 1.9334739187833652e-05, "loss": 0.3559, "num_input_tokens_seen": 26274848, "step": 26110 }, { "epoch": 12.312588401697312, "grad_norm": 1.6627384424209595, "learning_rate": 1.9324721248571667e-05, "loss": 0.2551, "num_input_tokens_seen": 26279680, "step": 26115 }, { "epoch": 12.314945780292314, "grad_norm": 0.08181659877300262, "learning_rate": 1.9314704270038014e-05, "loss": 0.109, "num_input_tokens_seen": 26284832, "step": 26120 }, { "epoch": 12.317303158887317, "grad_norm": 1.3783444166183472, "learning_rate": 1.9304688253928398e-05, "loss": 0.1924, "num_input_tokens_seen": 26290464, "step": 26125 }, { "epoch": 12.319660537482319, "grad_norm": 0.026996884495019913, "learning_rate": 1.9294673201938367e-05, "loss": 0.023, "num_input_tokens_seen": 26295360, "step": 26130 }, { "epoch": 12.322017916077321, "grad_norm": 0.16432850062847137, "learning_rate": 1.9284659115763284e-05, "loss": 0.0114, "num_input_tokens_seen": 26300096, "step": 26135 }, { "epoch": 12.324375294672324, "grad_norm": 0.18603254854679108, "learning_rate": 1.9274645997098373e-05, "loss": 0.0583, "num_input_tokens_seen": 26305504, "step": 26140 }, { "epoch": 12.326732673267326, "grad_norm": 1.3946244716644287, "learning_rate": 1.9264633847638686e-05, "loss": 0.1144, "num_input_tokens_seen": 26311776, "step": 26145 }, { "epoch": 12.329090051862329, "grad_norm": 0.15135574340820312, "learning_rate": 1.9254622669079108e-05, "loss": 0.2091, "num_input_tokens_seen": 26317216, "step": 26150 }, { "epoch": 12.331447430457331, "grad_norm": 0.37423765659332275, "learning_rate": 1.9244612463114357e-05, "loss": 0.129, "num_input_tokens_seen": 26321920, "step": 26155 }, { "epoch": 12.333804809052333, "grad_norm": 1.1846423149108887, "learning_rate": 1.9234603231438995e-05, "loss": 0.0705, "num_input_tokens_seen": 26326176, "step": 26160 }, { "epoch": 12.336162187647336, "grad_norm": 1.5023279190063477, "learning_rate": 1.922459497574742e-05, "loss": 0.0382, "num_input_tokens_seen": 26331200, "step": 26165 }, { "epoch": 12.338519566242338, "grad_norm": 0.21488019824028015, "learning_rate": 1.9214587697733854e-05, "loss": 0.0154, "num_input_tokens_seen": 26335456, "step": 26170 }, { "epoch": 12.34087694483734, "grad_norm": 0.5021994113922119, "learning_rate": 1.9204581399092357e-05, "loss": 0.14, "num_input_tokens_seen": 26340096, "step": 26175 }, { "epoch": 12.343234323432343, "grad_norm": 0.2653580904006958, "learning_rate": 1.919457608151683e-05, "loss": 0.1437, "num_input_tokens_seen": 26345760, "step": 26180 }, { "epoch": 12.345591702027345, "grad_norm": 1.1875284910202026, "learning_rate": 1.9184571746701e-05, "loss": 0.0954, "num_input_tokens_seen": 26350752, "step": 26185 }, { "epoch": 12.347949080622348, "grad_norm": 0.2016352117061615, "learning_rate": 1.917456839633844e-05, "loss": 0.0317, "num_input_tokens_seen": 26356480, "step": 26190 }, { "epoch": 12.35030645921735, "grad_norm": 0.08276023715734482, "learning_rate": 1.916456603212253e-05, "loss": 0.0273, "num_input_tokens_seen": 26361792, "step": 26195 }, { "epoch": 12.352663837812353, "grad_norm": 2.157674789428711, "learning_rate": 1.9154564655746517e-05, "loss": 0.1365, "num_input_tokens_seen": 26366720, "step": 26200 }, { "epoch": 12.355021216407355, "grad_norm": 2.0381765365600586, "learning_rate": 1.9144564268903455e-05, "loss": 0.2079, "num_input_tokens_seen": 26372160, "step": 26205 }, { "epoch": 12.357378595002357, "grad_norm": 0.025621574372053146, "learning_rate": 1.913456487328625e-05, "loss": 0.1809, "num_input_tokens_seen": 26376992, "step": 26210 }, { "epoch": 12.35973597359736, "grad_norm": 0.9552068710327148, "learning_rate": 1.9124566470587616e-05, "loss": 0.1786, "num_input_tokens_seen": 26381184, "step": 26215 }, { "epoch": 12.362093352192362, "grad_norm": 1.0455586910247803, "learning_rate": 1.911456906250012e-05, "loss": 0.1136, "num_input_tokens_seen": 26386272, "step": 26220 }, { "epoch": 12.364450730787365, "grad_norm": 0.5230002403259277, "learning_rate": 1.9104572650716148e-05, "loss": 0.2963, "num_input_tokens_seen": 26391104, "step": 26225 }, { "epoch": 12.366808109382367, "grad_norm": 0.3745039701461792, "learning_rate": 1.9094577236927938e-05, "loss": 0.0096, "num_input_tokens_seen": 26395648, "step": 26230 }, { "epoch": 12.36916548797737, "grad_norm": 1.5273983478546143, "learning_rate": 1.9084582822827525e-05, "loss": 0.2489, "num_input_tokens_seen": 26399936, "step": 26235 }, { "epoch": 12.371522866572372, "grad_norm": 0.05927072837948799, "learning_rate": 1.9074589410106806e-05, "loss": 0.0204, "num_input_tokens_seen": 26406368, "step": 26240 }, { "epoch": 12.373880245167374, "grad_norm": 0.5586152076721191, "learning_rate": 1.906459700045749e-05, "loss": 0.2097, "num_input_tokens_seen": 26411136, "step": 26245 }, { "epoch": 12.376237623762377, "grad_norm": 0.22496210038661957, "learning_rate": 1.9054605595571128e-05, "loss": 0.0954, "num_input_tokens_seen": 26415968, "step": 26250 }, { "epoch": 12.378595002357379, "grad_norm": 0.41200006008148193, "learning_rate": 1.9044615197139087e-05, "loss": 0.0923, "num_input_tokens_seen": 26421632, "step": 26255 }, { "epoch": 12.380952380952381, "grad_norm": 0.779746413230896, "learning_rate": 1.9034625806852578e-05, "loss": 0.0509, "num_input_tokens_seen": 26427264, "step": 26260 }, { "epoch": 12.383309759547384, "grad_norm": 0.18009452521800995, "learning_rate": 1.9024637426402632e-05, "loss": 0.0671, "num_input_tokens_seen": 26432576, "step": 26265 }, { "epoch": 12.385667138142386, "grad_norm": 1.075073003768921, "learning_rate": 1.9014650057480123e-05, "loss": 0.1863, "num_input_tokens_seen": 26437728, "step": 26270 }, { "epoch": 12.388024516737389, "grad_norm": 0.07066333293914795, "learning_rate": 1.9004663701775724e-05, "loss": 0.0176, "num_input_tokens_seen": 26441696, "step": 26275 }, { "epoch": 12.390381895332391, "grad_norm": 0.10011280328035355, "learning_rate": 1.8994678360979966e-05, "loss": 0.0817, "num_input_tokens_seen": 26445632, "step": 26280 }, { "epoch": 12.392739273927393, "grad_norm": 0.12400981783866882, "learning_rate": 1.898469403678321e-05, "loss": 0.0623, "num_input_tokens_seen": 26452128, "step": 26285 }, { "epoch": 12.395096652522396, "grad_norm": 0.10155995190143585, "learning_rate": 1.8974710730875606e-05, "loss": 0.1016, "num_input_tokens_seen": 26457568, "step": 26290 }, { "epoch": 12.397454031117398, "grad_norm": 0.7141211032867432, "learning_rate": 1.896472844494718e-05, "loss": 0.0334, "num_input_tokens_seen": 26462336, "step": 26295 }, { "epoch": 12.3998114097124, "grad_norm": 0.20132014155387878, "learning_rate": 1.895474718068775e-05, "loss": 0.0358, "num_input_tokens_seen": 26467488, "step": 26300 }, { "epoch": 12.402168788307403, "grad_norm": 0.07192229479551315, "learning_rate": 1.8944766939786977e-05, "loss": 0.2136, "num_input_tokens_seen": 26473056, "step": 26305 }, { "epoch": 12.404526166902404, "grad_norm": 0.3855345845222473, "learning_rate": 1.893478772393435e-05, "loss": 0.1166, "num_input_tokens_seen": 26478336, "step": 26310 }, { "epoch": 12.406883545497408, "grad_norm": 1.4167314767837524, "learning_rate": 1.892480953481918e-05, "loss": 0.0617, "num_input_tokens_seen": 26482688, "step": 26315 }, { "epoch": 12.409240924092408, "grad_norm": 1.7515360116958618, "learning_rate": 1.891483237413061e-05, "loss": 0.2265, "num_input_tokens_seen": 26487840, "step": 26320 }, { "epoch": 12.41159830268741, "grad_norm": 0.10461917519569397, "learning_rate": 1.890485624355759e-05, "loss": 0.0407, "num_input_tokens_seen": 26493248, "step": 26325 }, { "epoch": 12.413955681282413, "grad_norm": 1.2750508785247803, "learning_rate": 1.8894881144788924e-05, "loss": 0.1099, "num_input_tokens_seen": 26498304, "step": 26330 }, { "epoch": 12.416313059877416, "grad_norm": 2.4427247047424316, "learning_rate": 1.8884907079513224e-05, "loss": 0.1713, "num_input_tokens_seen": 26504160, "step": 26335 }, { "epoch": 12.418670438472418, "grad_norm": 0.18022190034389496, "learning_rate": 1.8874934049418925e-05, "loss": 0.0656, "num_input_tokens_seen": 26508832, "step": 26340 }, { "epoch": 12.42102781706742, "grad_norm": 0.8508450388908386, "learning_rate": 1.886496205619429e-05, "loss": 0.0678, "num_input_tokens_seen": 26513472, "step": 26345 }, { "epoch": 12.423385195662423, "grad_norm": 1.0801897048950195, "learning_rate": 1.8854991101527412e-05, "loss": 0.0768, "num_input_tokens_seen": 26517888, "step": 26350 }, { "epoch": 12.425742574257425, "grad_norm": 0.10023301094770432, "learning_rate": 1.884502118710621e-05, "loss": 0.081, "num_input_tokens_seen": 26523840, "step": 26355 }, { "epoch": 12.428099952852428, "grad_norm": 0.9345749616622925, "learning_rate": 1.883505231461841e-05, "loss": 0.1915, "num_input_tokens_seen": 26527712, "step": 26360 }, { "epoch": 12.43045733144743, "grad_norm": 1.5033714771270752, "learning_rate": 1.882508448575158e-05, "loss": 0.1497, "num_input_tokens_seen": 26532864, "step": 26365 }, { "epoch": 12.432814710042432, "grad_norm": 0.9796621203422546, "learning_rate": 1.88151177021931e-05, "loss": 0.0965, "num_input_tokens_seen": 26538240, "step": 26370 }, { "epoch": 12.435172088637435, "grad_norm": 0.1352817565202713, "learning_rate": 1.8805151965630184e-05, "loss": 0.1476, "num_input_tokens_seen": 26543168, "step": 26375 }, { "epoch": 12.437529467232437, "grad_norm": 0.14116312563419342, "learning_rate": 1.8795187277749856e-05, "loss": 0.185, "num_input_tokens_seen": 26548000, "step": 26380 }, { "epoch": 12.43988684582744, "grad_norm": 0.08236421644687653, "learning_rate": 1.8785223640238968e-05, "loss": 0.1658, "num_input_tokens_seen": 26553152, "step": 26385 }, { "epoch": 12.442244224422442, "grad_norm": 0.8933995962142944, "learning_rate": 1.87752610547842e-05, "loss": 0.0749, "num_input_tokens_seen": 26558080, "step": 26390 }, { "epoch": 12.444601603017444, "grad_norm": 0.14945891499519348, "learning_rate": 1.8765299523072046e-05, "loss": 0.0839, "num_input_tokens_seen": 26562880, "step": 26395 }, { "epoch": 12.446958981612447, "grad_norm": 0.3916858732700348, "learning_rate": 1.8755339046788823e-05, "loss": 0.0115, "num_input_tokens_seen": 26567584, "step": 26400 }, { "epoch": 12.44931636020745, "grad_norm": 0.5069494843482971, "learning_rate": 1.8745379627620668e-05, "loss": 0.0555, "num_input_tokens_seen": 26573152, "step": 26405 }, { "epoch": 12.451673738802452, "grad_norm": 2.236811637878418, "learning_rate": 1.8735421267253546e-05, "loss": 0.1749, "num_input_tokens_seen": 26577568, "step": 26410 }, { "epoch": 12.454031117397454, "grad_norm": 1.2437444925308228, "learning_rate": 1.872546396737324e-05, "loss": 0.1447, "num_input_tokens_seen": 26581888, "step": 26415 }, { "epoch": 12.456388495992456, "grad_norm": 0.30875158309936523, "learning_rate": 1.8715507729665345e-05, "loss": 0.0514, "num_input_tokens_seen": 26586784, "step": 26420 }, { "epoch": 12.458745874587459, "grad_norm": 0.13806389272212982, "learning_rate": 1.8705552555815286e-05, "loss": 0.0624, "num_input_tokens_seen": 26591264, "step": 26425 }, { "epoch": 12.461103253182461, "grad_norm": 0.03906962648034096, "learning_rate": 1.8695598447508305e-05, "loss": 0.1076, "num_input_tokens_seen": 26596736, "step": 26430 }, { "epoch": 12.463460631777464, "grad_norm": 1.21387779712677, "learning_rate": 1.868564540642947e-05, "loss": 0.2259, "num_input_tokens_seen": 26602080, "step": 26435 }, { "epoch": 12.465818010372466, "grad_norm": 2.21094012260437, "learning_rate": 1.8675693434263654e-05, "loss": 0.1654, "num_input_tokens_seen": 26607456, "step": 26440 }, { "epoch": 12.468175388967468, "grad_norm": 1.5556658506393433, "learning_rate": 1.8665742532695556e-05, "loss": 0.0884, "num_input_tokens_seen": 26612256, "step": 26445 }, { "epoch": 12.47053276756247, "grad_norm": 0.49731963872909546, "learning_rate": 1.86557927034097e-05, "loss": 0.0561, "num_input_tokens_seen": 26618016, "step": 26450 }, { "epoch": 12.472890146157473, "grad_norm": 2.143582820892334, "learning_rate": 1.8645843948090423e-05, "loss": 0.2689, "num_input_tokens_seen": 26622464, "step": 26455 }, { "epoch": 12.475247524752476, "grad_norm": 2.126502513885498, "learning_rate": 1.863589626842187e-05, "loss": 0.0698, "num_input_tokens_seen": 26627840, "step": 26460 }, { "epoch": 12.477604903347478, "grad_norm": 0.03278864920139313, "learning_rate": 1.862594966608803e-05, "loss": 0.0454, "num_input_tokens_seen": 26632640, "step": 26465 }, { "epoch": 12.47996228194248, "grad_norm": 0.07435809075832367, "learning_rate": 1.8616004142772682e-05, "loss": 0.0631, "num_input_tokens_seen": 26637888, "step": 26470 }, { "epoch": 12.482319660537483, "grad_norm": 0.7183917164802551, "learning_rate": 1.8606059700159446e-05, "loss": 0.048, "num_input_tokens_seen": 26642912, "step": 26475 }, { "epoch": 12.484677039132485, "grad_norm": 0.9528118371963501, "learning_rate": 1.8596116339931733e-05, "loss": 0.0727, "num_input_tokens_seen": 26647488, "step": 26480 }, { "epoch": 12.487034417727488, "grad_norm": 0.04166156426072121, "learning_rate": 1.858617406377279e-05, "loss": 0.0296, "num_input_tokens_seen": 26652128, "step": 26485 }, { "epoch": 12.48939179632249, "grad_norm": 0.11978056281805038, "learning_rate": 1.8576232873365678e-05, "loss": 0.1105, "num_input_tokens_seen": 26657120, "step": 26490 }, { "epoch": 12.491749174917492, "grad_norm": 0.3412957489490509, "learning_rate": 1.8566292770393278e-05, "loss": 0.0465, "num_input_tokens_seen": 26663136, "step": 26495 }, { "epoch": 12.494106553512495, "grad_norm": 0.1915649026632309, "learning_rate": 1.855635375653827e-05, "loss": 0.1163, "num_input_tokens_seen": 26668096, "step": 26500 }, { "epoch": 12.496463932107497, "grad_norm": 1.264589548110962, "learning_rate": 1.8546415833483165e-05, "loss": 0.0801, "num_input_tokens_seen": 26673792, "step": 26505 }, { "epoch": 12.4988213107025, "grad_norm": 0.04385491460561752, "learning_rate": 1.853647900291029e-05, "loss": 0.163, "num_input_tokens_seen": 26678208, "step": 26510 }, { "epoch": 12.5011786892975, "grad_norm": 0.09864217042922974, "learning_rate": 1.8526543266501767e-05, "loss": 0.0717, "num_input_tokens_seen": 26683232, "step": 26515 }, { "epoch": 12.503536067892503, "grad_norm": 0.27931153774261475, "learning_rate": 1.8516608625939557e-05, "loss": 0.0624, "num_input_tokens_seen": 26687360, "step": 26520 }, { "epoch": 12.505893446487505, "grad_norm": 1.3303364515304565, "learning_rate": 1.850667508290543e-05, "loss": 0.1794, "num_input_tokens_seen": 26692544, "step": 26525 }, { "epoch": 12.508250825082508, "grad_norm": 0.0669228807091713, "learning_rate": 1.8496742639080955e-05, "loss": 0.1964, "num_input_tokens_seen": 26697088, "step": 26530 }, { "epoch": 12.51060820367751, "grad_norm": 1.6520346403121948, "learning_rate": 1.8486811296147534e-05, "loss": 0.124, "num_input_tokens_seen": 26702176, "step": 26535 }, { "epoch": 12.512965582272512, "grad_norm": 0.06449121236801147, "learning_rate": 1.847688105578637e-05, "loss": 0.1671, "num_input_tokens_seen": 26707168, "step": 26540 }, { "epoch": 12.515322960867515, "grad_norm": 0.8066880702972412, "learning_rate": 1.846695191967849e-05, "loss": 0.041, "num_input_tokens_seen": 26711616, "step": 26545 }, { "epoch": 12.517680339462517, "grad_norm": 0.06070499122142792, "learning_rate": 1.8457023889504723e-05, "loss": 0.1358, "num_input_tokens_seen": 26717024, "step": 26550 }, { "epoch": 12.52003771805752, "grad_norm": 0.6734881401062012, "learning_rate": 1.8447096966945722e-05, "loss": 0.2376, "num_input_tokens_seen": 26721696, "step": 26555 }, { "epoch": 12.522395096652522, "grad_norm": 1.1666302680969238, "learning_rate": 1.8437171153681935e-05, "loss": 0.1227, "num_input_tokens_seen": 26726688, "step": 26560 }, { "epoch": 12.524752475247524, "grad_norm": 1.823142647743225, "learning_rate": 1.842724645139365e-05, "loss": 0.1945, "num_input_tokens_seen": 26731488, "step": 26565 }, { "epoch": 12.527109853842527, "grad_norm": 0.04045429825782776, "learning_rate": 1.8417322861760937e-05, "loss": 0.1259, "num_input_tokens_seen": 26737152, "step": 26570 }, { "epoch": 12.52946723243753, "grad_norm": 1.1819708347320557, "learning_rate": 1.8407400386463698e-05, "loss": 0.0812, "num_input_tokens_seen": 26741728, "step": 26575 }, { "epoch": 12.531824611032532, "grad_norm": 0.4535934329032898, "learning_rate": 1.8397479027181637e-05, "loss": 0.0633, "num_input_tokens_seen": 26746304, "step": 26580 }, { "epoch": 12.534181989627534, "grad_norm": 1.4787627458572388, "learning_rate": 1.838755878559427e-05, "loss": 0.0855, "num_input_tokens_seen": 26751232, "step": 26585 }, { "epoch": 12.536539368222536, "grad_norm": 0.0452314056456089, "learning_rate": 1.837763966338093e-05, "loss": 0.0297, "num_input_tokens_seen": 26756288, "step": 26590 }, { "epoch": 12.538896746817539, "grad_norm": 1.7520490884780884, "learning_rate": 1.8367721662220754e-05, "loss": 0.1954, "num_input_tokens_seen": 26760704, "step": 26595 }, { "epoch": 12.541254125412541, "grad_norm": 1.315650463104248, "learning_rate": 1.8357804783792688e-05, "loss": 0.12, "num_input_tokens_seen": 26765856, "step": 26600 }, { "epoch": 12.543611504007544, "grad_norm": 0.04573952779173851, "learning_rate": 1.83478890297755e-05, "loss": 0.1588, "num_input_tokens_seen": 26770784, "step": 26605 }, { "epoch": 12.545968882602546, "grad_norm": 2.573442220687866, "learning_rate": 1.833797440184775e-05, "loss": 0.1677, "num_input_tokens_seen": 26776928, "step": 26610 }, { "epoch": 12.548326261197548, "grad_norm": 0.4091942012310028, "learning_rate": 1.8328060901687815e-05, "loss": 0.111, "num_input_tokens_seen": 26782464, "step": 26615 }, { "epoch": 12.55068363979255, "grad_norm": 2.257809638977051, "learning_rate": 1.831814853097389e-05, "loss": 0.0713, "num_input_tokens_seen": 26787456, "step": 26620 }, { "epoch": 12.553041018387553, "grad_norm": 0.16725626587867737, "learning_rate": 1.830823729138397e-05, "loss": 0.029, "num_input_tokens_seen": 26792896, "step": 26625 }, { "epoch": 12.555398396982556, "grad_norm": 1.6390631198883057, "learning_rate": 1.8298327184595853e-05, "loss": 0.0474, "num_input_tokens_seen": 26797536, "step": 26630 }, { "epoch": 12.557755775577558, "grad_norm": 0.10767493396997452, "learning_rate": 1.8288418212287157e-05, "loss": 0.0256, "num_input_tokens_seen": 26802624, "step": 26635 }, { "epoch": 12.56011315417256, "grad_norm": 0.03531438484787941, "learning_rate": 1.8278510376135296e-05, "loss": 0.1222, "num_input_tokens_seen": 26807040, "step": 26640 }, { "epoch": 12.562470532767563, "grad_norm": 0.7496102452278137, "learning_rate": 1.8268603677817514e-05, "loss": 0.0485, "num_input_tokens_seen": 26811328, "step": 26645 }, { "epoch": 12.564827911362565, "grad_norm": 1.6539169549942017, "learning_rate": 1.825869811901083e-05, "loss": 0.1946, "num_input_tokens_seen": 26816768, "step": 26650 }, { "epoch": 12.567185289957568, "grad_norm": 0.5376436114311218, "learning_rate": 1.824879370139209e-05, "loss": 0.0358, "num_input_tokens_seen": 26821728, "step": 26655 }, { "epoch": 12.56954266855257, "grad_norm": 0.27063214778900146, "learning_rate": 1.8238890426637954e-05, "loss": 0.0438, "num_input_tokens_seen": 26826752, "step": 26660 }, { "epoch": 12.571900047147572, "grad_norm": 0.11918725073337555, "learning_rate": 1.8228988296424877e-05, "loss": 0.1072, "num_input_tokens_seen": 26831584, "step": 26665 }, { "epoch": 12.574257425742575, "grad_norm": 0.12391061335802078, "learning_rate": 1.8219087312429106e-05, "loss": 0.1822, "num_input_tokens_seen": 26835872, "step": 26670 }, { "epoch": 12.576614804337577, "grad_norm": 1.302678108215332, "learning_rate": 1.820918747632672e-05, "loss": 0.1285, "num_input_tokens_seen": 26840768, "step": 26675 }, { "epoch": 12.57897218293258, "grad_norm": 0.1766233593225479, "learning_rate": 1.8199288789793596e-05, "loss": 0.0376, "num_input_tokens_seen": 26845120, "step": 26680 }, { "epoch": 12.581329561527582, "grad_norm": 0.5618974566459656, "learning_rate": 1.8189391254505416e-05, "loss": 0.1594, "num_input_tokens_seen": 26850816, "step": 26685 }, { "epoch": 12.583686940122584, "grad_norm": 0.5116586685180664, "learning_rate": 1.817949487213765e-05, "loss": 0.0573, "num_input_tokens_seen": 26855968, "step": 26690 }, { "epoch": 12.586044318717587, "grad_norm": 0.04545127600431442, "learning_rate": 1.8169599644365597e-05, "loss": 0.0635, "num_input_tokens_seen": 26860832, "step": 26695 }, { "epoch": 12.58840169731259, "grad_norm": 0.6471052765846252, "learning_rate": 1.8159705572864356e-05, "loss": 0.1387, "num_input_tokens_seen": 26866112, "step": 26700 }, { "epoch": 12.590759075907592, "grad_norm": 0.1730050891637802, "learning_rate": 1.814981265930882e-05, "loss": 0.0171, "num_input_tokens_seen": 26872448, "step": 26705 }, { "epoch": 12.593116454502592, "grad_norm": 0.04087478294968605, "learning_rate": 1.8139920905373687e-05, "loss": 0.1264, "num_input_tokens_seen": 26876864, "step": 26710 }, { "epoch": 12.595473833097596, "grad_norm": 1.1015421152114868, "learning_rate": 1.813003031273347e-05, "loss": 0.0924, "num_input_tokens_seen": 26881056, "step": 26715 }, { "epoch": 12.597831211692597, "grad_norm": 0.20241416990756989, "learning_rate": 1.8120140883062472e-05, "loss": 0.1318, "num_input_tokens_seen": 26886912, "step": 26720 }, { "epoch": 12.6001885902876, "grad_norm": 0.3160002827644348, "learning_rate": 1.811025261803482e-05, "loss": 0.1318, "num_input_tokens_seen": 26891680, "step": 26725 }, { "epoch": 12.602545968882602, "grad_norm": 2.61942195892334, "learning_rate": 1.8100365519324402e-05, "loss": 0.18, "num_input_tokens_seen": 26896928, "step": 26730 }, { "epoch": 12.604903347477604, "grad_norm": 0.3568059802055359, "learning_rate": 1.8090479588604972e-05, "loss": 0.039, "num_input_tokens_seen": 26902048, "step": 26735 }, { "epoch": 12.607260726072607, "grad_norm": 0.0936395451426506, "learning_rate": 1.808059482755002e-05, "loss": 0.1198, "num_input_tokens_seen": 26907232, "step": 26740 }, { "epoch": 12.609618104667609, "grad_norm": 1.5663076639175415, "learning_rate": 1.8070711237832875e-05, "loss": 0.1528, "num_input_tokens_seen": 26912672, "step": 26745 }, { "epoch": 12.611975483262611, "grad_norm": 0.08821085840463638, "learning_rate": 1.806082882112667e-05, "loss": 0.0948, "num_input_tokens_seen": 26917280, "step": 26750 }, { "epoch": 12.614332861857614, "grad_norm": 0.7079405784606934, "learning_rate": 1.8050947579104326e-05, "loss": 0.0413, "num_input_tokens_seen": 26923424, "step": 26755 }, { "epoch": 12.616690240452616, "grad_norm": 0.08811229467391968, "learning_rate": 1.8041067513438564e-05, "loss": 0.1547, "num_input_tokens_seen": 26928576, "step": 26760 }, { "epoch": 12.619047619047619, "grad_norm": 0.8376411199569702, "learning_rate": 1.8031188625801916e-05, "loss": 0.1755, "num_input_tokens_seen": 26933792, "step": 26765 }, { "epoch": 12.621404997642621, "grad_norm": 0.6960923671722412, "learning_rate": 1.802131091786671e-05, "loss": 0.329, "num_input_tokens_seen": 26938304, "step": 26770 }, { "epoch": 12.623762376237623, "grad_norm": 2.1722512245178223, "learning_rate": 1.801143439130508e-05, "loss": 0.1959, "num_input_tokens_seen": 26941920, "step": 26775 }, { "epoch": 12.626119754832626, "grad_norm": 0.18706496059894562, "learning_rate": 1.8001559047788943e-05, "loss": 0.0282, "num_input_tokens_seen": 26946816, "step": 26780 }, { "epoch": 12.628477133427628, "grad_norm": 0.1814892590045929, "learning_rate": 1.7991684888990032e-05, "loss": 0.1827, "num_input_tokens_seen": 26951072, "step": 26785 }, { "epoch": 12.63083451202263, "grad_norm": 0.8955724835395813, "learning_rate": 1.7981811916579876e-05, "loss": 0.1855, "num_input_tokens_seen": 26955712, "step": 26790 }, { "epoch": 12.633191890617633, "grad_norm": 1.60872220993042, "learning_rate": 1.7971940132229804e-05, "loss": 0.1663, "num_input_tokens_seen": 26959648, "step": 26795 }, { "epoch": 12.635549269212635, "grad_norm": 0.21892905235290527, "learning_rate": 1.7962069537610938e-05, "loss": 0.0622, "num_input_tokens_seen": 26963552, "step": 26800 }, { "epoch": 12.637906647807638, "grad_norm": 0.18110020458698273, "learning_rate": 1.7952200134394195e-05, "loss": 0.0589, "num_input_tokens_seen": 26969152, "step": 26805 }, { "epoch": 12.64026402640264, "grad_norm": 1.0185843706130981, "learning_rate": 1.7942331924250308e-05, "loss": 0.0361, "num_input_tokens_seen": 26976064, "step": 26810 }, { "epoch": 12.642621404997643, "grad_norm": 0.24034740030765533, "learning_rate": 1.79324649088498e-05, "loss": 0.0492, "num_input_tokens_seen": 26980576, "step": 26815 }, { "epoch": 12.644978783592645, "grad_norm": 1.2098171710968018, "learning_rate": 1.7922599089862973e-05, "loss": 0.0851, "num_input_tokens_seen": 26985152, "step": 26820 }, { "epoch": 12.647336162187647, "grad_norm": 0.47787922620773315, "learning_rate": 1.791273446895996e-05, "loss": 0.1555, "num_input_tokens_seen": 26991616, "step": 26825 }, { "epoch": 12.64969354078265, "grad_norm": 2.083928346633911, "learning_rate": 1.790287104781066e-05, "loss": 0.1756, "num_input_tokens_seen": 26997568, "step": 26830 }, { "epoch": 12.652050919377652, "grad_norm": 0.08983921259641647, "learning_rate": 1.7893008828084795e-05, "loss": 0.0823, "num_input_tokens_seen": 27003840, "step": 26835 }, { "epoch": 12.654408297972655, "grad_norm": 0.27160385251045227, "learning_rate": 1.788314781145186e-05, "loss": 0.1157, "num_input_tokens_seen": 27010112, "step": 26840 }, { "epoch": 12.656765676567657, "grad_norm": 0.4658825695514679, "learning_rate": 1.7873287999581167e-05, "loss": 0.2873, "num_input_tokens_seen": 27016128, "step": 26845 }, { "epoch": 12.65912305516266, "grad_norm": 2.0715675354003906, "learning_rate": 1.7863429394141813e-05, "loss": 0.2137, "num_input_tokens_seen": 27020608, "step": 26850 }, { "epoch": 12.661480433757662, "grad_norm": 0.660693883895874, "learning_rate": 1.785357199680269e-05, "loss": 0.2033, "num_input_tokens_seen": 27025856, "step": 26855 }, { "epoch": 12.663837812352664, "grad_norm": 1.83384370803833, "learning_rate": 1.7843715809232487e-05, "loss": 0.0866, "num_input_tokens_seen": 27032256, "step": 26860 }, { "epoch": 12.666195190947667, "grad_norm": 1.679612398147583, "learning_rate": 1.7833860833099685e-05, "loss": 0.0642, "num_input_tokens_seen": 27036864, "step": 26865 }, { "epoch": 12.668552569542669, "grad_norm": 2.1805031299591064, "learning_rate": 1.7824007070072576e-05, "loss": 0.2172, "num_input_tokens_seen": 27042880, "step": 26870 }, { "epoch": 12.670909948137671, "grad_norm": 1.2551939487457275, "learning_rate": 1.7814154521819232e-05, "loss": 0.2135, "num_input_tokens_seen": 27048160, "step": 26875 }, { "epoch": 12.673267326732674, "grad_norm": 2.0490992069244385, "learning_rate": 1.7804303190007515e-05, "loss": 0.0988, "num_input_tokens_seen": 27053120, "step": 26880 }, { "epoch": 12.675624705327676, "grad_norm": 1.0508726835250854, "learning_rate": 1.779445307630509e-05, "loss": 0.2163, "num_input_tokens_seen": 27057920, "step": 26885 }, { "epoch": 12.677982083922679, "grad_norm": 0.36193251609802246, "learning_rate": 1.7784604182379417e-05, "loss": 0.2985, "num_input_tokens_seen": 27063936, "step": 26890 }, { "epoch": 12.680339462517681, "grad_norm": 1.157289981842041, "learning_rate": 1.777475650989775e-05, "loss": 0.0876, "num_input_tokens_seen": 27069536, "step": 26895 }, { "epoch": 12.682696841112683, "grad_norm": 1.004560112953186, "learning_rate": 1.7764910060527122e-05, "loss": 0.1442, "num_input_tokens_seen": 27073792, "step": 26900 }, { "epoch": 12.685054219707686, "grad_norm": 0.25977814197540283, "learning_rate": 1.7755064835934377e-05, "loss": 0.0627, "num_input_tokens_seen": 27078144, "step": 26905 }, { "epoch": 12.687411598302688, "grad_norm": 0.3105151355266571, "learning_rate": 1.774522083778614e-05, "loss": 0.0435, "num_input_tokens_seen": 27082656, "step": 26910 }, { "epoch": 12.689768976897689, "grad_norm": 0.6892459392547607, "learning_rate": 1.773537806774885e-05, "loss": 0.1408, "num_input_tokens_seen": 27087296, "step": 26915 }, { "epoch": 12.692126355492691, "grad_norm": 0.2588241994380951, "learning_rate": 1.7725536527488694e-05, "loss": 0.1804, "num_input_tokens_seen": 27091968, "step": 26920 }, { "epoch": 12.694483734087694, "grad_norm": 0.2702696621417999, "learning_rate": 1.771569621867169e-05, "loss": 0.2068, "num_input_tokens_seen": 27097312, "step": 26925 }, { "epoch": 12.696841112682696, "grad_norm": 0.8416301012039185, "learning_rate": 1.770585714296364e-05, "loss": 0.1225, "num_input_tokens_seen": 27102400, "step": 26930 }, { "epoch": 12.699198491277699, "grad_norm": 1.0110342502593994, "learning_rate": 1.7696019302030133e-05, "loss": 0.1266, "num_input_tokens_seen": 27106880, "step": 26935 }, { "epoch": 12.701555869872701, "grad_norm": 1.1951879262924194, "learning_rate": 1.7686182697536536e-05, "loss": 0.2216, "num_input_tokens_seen": 27112704, "step": 26940 }, { "epoch": 12.703913248467703, "grad_norm": 0.5936616063117981, "learning_rate": 1.7676347331148026e-05, "loss": 0.0628, "num_input_tokens_seen": 27116960, "step": 26945 }, { "epoch": 12.706270627062706, "grad_norm": 0.017712770029902458, "learning_rate": 1.7666513204529566e-05, "loss": 0.0471, "num_input_tokens_seen": 27121216, "step": 26950 }, { "epoch": 12.708628005657708, "grad_norm": 0.19582797586917877, "learning_rate": 1.765668031934591e-05, "loss": 0.0528, "num_input_tokens_seen": 27125600, "step": 26955 }, { "epoch": 12.71098538425271, "grad_norm": 0.12980252504348755, "learning_rate": 1.7646848677261595e-05, "loss": 0.2318, "num_input_tokens_seen": 27130784, "step": 26960 }, { "epoch": 12.713342762847713, "grad_norm": 1.2735297679901123, "learning_rate": 1.7637018279940947e-05, "loss": 0.1232, "num_input_tokens_seen": 27135136, "step": 26965 }, { "epoch": 12.715700141442715, "grad_norm": 2.4544222354888916, "learning_rate": 1.7627189129048085e-05, "loss": 0.2008, "num_input_tokens_seen": 27139232, "step": 26970 }, { "epoch": 12.718057520037718, "grad_norm": 0.5122026205062866, "learning_rate": 1.761736122624692e-05, "loss": 0.02, "num_input_tokens_seen": 27143808, "step": 26975 }, { "epoch": 12.72041489863272, "grad_norm": 1.8351573944091797, "learning_rate": 1.760753457320115e-05, "loss": 0.1425, "num_input_tokens_seen": 27148288, "step": 26980 }, { "epoch": 12.722772277227723, "grad_norm": 1.4478065967559814, "learning_rate": 1.7597709171574263e-05, "loss": 0.113, "num_input_tokens_seen": 27153088, "step": 26985 }, { "epoch": 12.725129655822725, "grad_norm": 0.9679127931594849, "learning_rate": 1.7587885023029525e-05, "loss": 0.1922, "num_input_tokens_seen": 27157888, "step": 26990 }, { "epoch": 12.727487034417727, "grad_norm": 0.12409940361976624, "learning_rate": 1.757806212923e-05, "loss": 0.0365, "num_input_tokens_seen": 27162592, "step": 26995 }, { "epoch": 12.72984441301273, "grad_norm": 1.8194819688796997, "learning_rate": 1.7568240491838546e-05, "loss": 0.1694, "num_input_tokens_seen": 27167328, "step": 27000 }, { "epoch": 12.732201791607732, "grad_norm": 0.43118417263031006, "learning_rate": 1.755842011251779e-05, "loss": 0.0732, "num_input_tokens_seen": 27171360, "step": 27005 }, { "epoch": 12.734559170202735, "grad_norm": 1.6256301403045654, "learning_rate": 1.7548600992930158e-05, "loss": 0.1955, "num_input_tokens_seen": 27176096, "step": 27010 }, { "epoch": 12.736916548797737, "grad_norm": 2.041579008102417, "learning_rate": 1.753878313473786e-05, "loss": 0.1607, "num_input_tokens_seen": 27180576, "step": 27015 }, { "epoch": 12.73927392739274, "grad_norm": 0.10879157483577728, "learning_rate": 1.7528966539602886e-05, "loss": 0.0227, "num_input_tokens_seen": 27185440, "step": 27020 }, { "epoch": 12.741631305987742, "grad_norm": 0.0785142332315445, "learning_rate": 1.7519151209187037e-05, "loss": 0.1863, "num_input_tokens_seen": 27190784, "step": 27025 }, { "epoch": 12.743988684582744, "grad_norm": 0.3009236752986908, "learning_rate": 1.7509337145151865e-05, "loss": 0.0878, "num_input_tokens_seen": 27196544, "step": 27030 }, { "epoch": 12.746346063177747, "grad_norm": 0.3967486619949341, "learning_rate": 1.749952434915873e-05, "loss": 0.0907, "num_input_tokens_seen": 27201248, "step": 27035 }, { "epoch": 12.748703441772749, "grad_norm": 1.7418440580368042, "learning_rate": 1.7489712822868773e-05, "loss": 0.2218, "num_input_tokens_seen": 27205920, "step": 27040 }, { "epoch": 12.751060820367751, "grad_norm": 1.082627773284912, "learning_rate": 1.747990256794292e-05, "loss": 0.0563, "num_input_tokens_seen": 27210880, "step": 27045 }, { "epoch": 12.753418198962754, "grad_norm": 0.7207107543945312, "learning_rate": 1.7470093586041874e-05, "loss": 0.0722, "num_input_tokens_seen": 27215584, "step": 27050 }, { "epoch": 12.755775577557756, "grad_norm": 0.3992519676685333, "learning_rate": 1.7460285878826138e-05, "loss": 0.1195, "num_input_tokens_seen": 27220768, "step": 27055 }, { "epoch": 12.758132956152759, "grad_norm": 0.07866324484348297, "learning_rate": 1.745047944795598e-05, "loss": 0.1528, "num_input_tokens_seen": 27225696, "step": 27060 }, { "epoch": 12.760490334747761, "grad_norm": 0.04440399259328842, "learning_rate": 1.7440674295091476e-05, "loss": 0.0545, "num_input_tokens_seen": 27230848, "step": 27065 }, { "epoch": 12.762847713342763, "grad_norm": 1.890353798866272, "learning_rate": 1.7430870421892458e-05, "loss": 0.0859, "num_input_tokens_seen": 27236096, "step": 27070 }, { "epoch": 12.765205091937766, "grad_norm": 0.08899611979722977, "learning_rate": 1.742106783001856e-05, "loss": 0.0093, "num_input_tokens_seen": 27241600, "step": 27075 }, { "epoch": 12.767562470532768, "grad_norm": 0.2849535644054413, "learning_rate": 1.7411266521129195e-05, "loss": 0.0353, "num_input_tokens_seen": 27246560, "step": 27080 }, { "epoch": 12.76991984912777, "grad_norm": 0.2233775556087494, "learning_rate": 1.7401466496883567e-05, "loss": 0.1825, "num_input_tokens_seen": 27252896, "step": 27085 }, { "epoch": 12.772277227722773, "grad_norm": 0.46544215083122253, "learning_rate": 1.7391667758940638e-05, "loss": 0.0601, "num_input_tokens_seen": 27260256, "step": 27090 }, { "epoch": 12.774634606317775, "grad_norm": 0.7020683288574219, "learning_rate": 1.7381870308959176e-05, "loss": 0.0639, "num_input_tokens_seen": 27265824, "step": 27095 }, { "epoch": 12.776991984912778, "grad_norm": 0.134294793009758, "learning_rate": 1.737207414859772e-05, "loss": 0.0748, "num_input_tokens_seen": 27271712, "step": 27100 }, { "epoch": 12.77934936350778, "grad_norm": 0.04574476182460785, "learning_rate": 1.7362279279514605e-05, "loss": 0.142, "num_input_tokens_seen": 27276448, "step": 27105 }, { "epoch": 12.78170674210278, "grad_norm": 0.09034696221351624, "learning_rate": 1.7352485703367915e-05, "loss": 0.0496, "num_input_tokens_seen": 27281760, "step": 27110 }, { "epoch": 12.784064120697785, "grad_norm": 0.23322629928588867, "learning_rate": 1.7342693421815554e-05, "loss": 0.1635, "num_input_tokens_seen": 27288384, "step": 27115 }, { "epoch": 12.786421499292786, "grad_norm": 0.29693353176116943, "learning_rate": 1.733290243651518e-05, "loss": 0.0125, "num_input_tokens_seen": 27292416, "step": 27120 }, { "epoch": 12.788778877887788, "grad_norm": 0.34899431467056274, "learning_rate": 1.7323112749124253e-05, "loss": 0.1093, "num_input_tokens_seen": 27296992, "step": 27125 }, { "epoch": 12.79113625648279, "grad_norm": 0.04499302804470062, "learning_rate": 1.731332436129998e-05, "loss": 0.0352, "num_input_tokens_seen": 27302016, "step": 27130 }, { "epoch": 12.793493635077793, "grad_norm": 1.0688313245773315, "learning_rate": 1.7303537274699384e-05, "loss": 0.2661, "num_input_tokens_seen": 27305760, "step": 27135 }, { "epoch": 12.795851013672795, "grad_norm": 1.1279813051223755, "learning_rate": 1.729375149097925e-05, "loss": 0.1877, "num_input_tokens_seen": 27311264, "step": 27140 }, { "epoch": 12.798208392267798, "grad_norm": 2.5426111221313477, "learning_rate": 1.728396701179615e-05, "loss": 0.1656, "num_input_tokens_seen": 27316160, "step": 27145 }, { "epoch": 12.8005657708628, "grad_norm": 1.1835235357284546, "learning_rate": 1.7274183838806412e-05, "loss": 0.1456, "num_input_tokens_seen": 27321248, "step": 27150 }, { "epoch": 12.802923149457802, "grad_norm": 0.8848495483398438, "learning_rate": 1.726440197366618e-05, "loss": 0.1675, "num_input_tokens_seen": 27325856, "step": 27155 }, { "epoch": 12.805280528052805, "grad_norm": 1.4375792741775513, "learning_rate": 1.7254621418031346e-05, "loss": 0.2316, "num_input_tokens_seen": 27331296, "step": 27160 }, { "epoch": 12.807637906647807, "grad_norm": 0.6317828893661499, "learning_rate": 1.7244842173557606e-05, "loss": 0.0987, "num_input_tokens_seen": 27335712, "step": 27165 }, { "epoch": 12.80999528524281, "grad_norm": 0.20205578207969666, "learning_rate": 1.7235064241900408e-05, "loss": 0.2136, "num_input_tokens_seen": 27341024, "step": 27170 }, { "epoch": 12.812352663837812, "grad_norm": 1.3771222829818726, "learning_rate": 1.7225287624714985e-05, "loss": 0.1962, "num_input_tokens_seen": 27345536, "step": 27175 }, { "epoch": 12.814710042432814, "grad_norm": 0.22700606286525726, "learning_rate": 1.7215512323656362e-05, "loss": 0.2587, "num_input_tokens_seen": 27350912, "step": 27180 }, { "epoch": 12.817067421027817, "grad_norm": 0.1624482423067093, "learning_rate": 1.7205738340379347e-05, "loss": 0.085, "num_input_tokens_seen": 27356256, "step": 27185 }, { "epoch": 12.81942479962282, "grad_norm": 1.3640388250350952, "learning_rate": 1.7195965676538476e-05, "loss": 0.1389, "num_input_tokens_seen": 27361088, "step": 27190 }, { "epoch": 12.821782178217822, "grad_norm": 0.14057625830173492, "learning_rate": 1.7186194333788114e-05, "loss": 0.0486, "num_input_tokens_seen": 27365376, "step": 27195 }, { "epoch": 12.824139556812824, "grad_norm": 0.3381286859512329, "learning_rate": 1.717642431378238e-05, "loss": 0.2353, "num_input_tokens_seen": 27369824, "step": 27200 }, { "epoch": 12.826496935407826, "grad_norm": 1.7533892393112183, "learning_rate": 1.716665561817517e-05, "loss": 0.2298, "num_input_tokens_seen": 27373920, "step": 27205 }, { "epoch": 12.828854314002829, "grad_norm": 0.5892969965934753, "learning_rate": 1.715688824862016e-05, "loss": 0.1249, "num_input_tokens_seen": 27379648, "step": 27210 }, { "epoch": 12.831211692597831, "grad_norm": 0.05433589220046997, "learning_rate": 1.714712220677081e-05, "loss": 0.0492, "num_input_tokens_seen": 27384800, "step": 27215 }, { "epoch": 12.833569071192834, "grad_norm": 0.010151957161724567, "learning_rate": 1.7137357494280332e-05, "loss": 0.258, "num_input_tokens_seen": 27390528, "step": 27220 }, { "epoch": 12.835926449787836, "grad_norm": 0.080060675740242, "learning_rate": 1.7127594112801724e-05, "loss": 0.1837, "num_input_tokens_seen": 27395136, "step": 27225 }, { "epoch": 12.838283828382838, "grad_norm": 0.39468106627464294, "learning_rate": 1.711783206398777e-05, "loss": 0.0784, "num_input_tokens_seen": 27399872, "step": 27230 }, { "epoch": 12.84064120697784, "grad_norm": 0.6433387398719788, "learning_rate": 1.7108071349491022e-05, "loss": 0.0581, "num_input_tokens_seen": 27404928, "step": 27235 }, { "epoch": 12.842998585572843, "grad_norm": 0.20208287239074707, "learning_rate": 1.7098311970963793e-05, "loss": 0.0439, "num_input_tokens_seen": 27409984, "step": 27240 }, { "epoch": 12.845355964167846, "grad_norm": 0.18164458870887756, "learning_rate": 1.708855393005818e-05, "loss": 0.1297, "num_input_tokens_seen": 27415200, "step": 27245 }, { "epoch": 12.847713342762848, "grad_norm": 0.5703117847442627, "learning_rate": 1.7078797228426065e-05, "loss": 0.236, "num_input_tokens_seen": 27420000, "step": 27250 }, { "epoch": 12.85007072135785, "grad_norm": 0.07350876927375793, "learning_rate": 1.7069041867719085e-05, "loss": 0.1015, "num_input_tokens_seen": 27424480, "step": 27255 }, { "epoch": 12.852428099952853, "grad_norm": 1.0234124660491943, "learning_rate": 1.705928784958865e-05, "loss": 0.1061, "num_input_tokens_seen": 27429728, "step": 27260 }, { "epoch": 12.854785478547855, "grad_norm": 0.12659509479999542, "learning_rate": 1.704953517568596e-05, "loss": 0.0357, "num_input_tokens_seen": 27434080, "step": 27265 }, { "epoch": 12.857142857142858, "grad_norm": 0.4493967294692993, "learning_rate": 1.7039783847661973e-05, "loss": 0.2193, "num_input_tokens_seen": 27439616, "step": 27270 }, { "epoch": 12.85950023573786, "grad_norm": 0.1490928828716278, "learning_rate": 1.703003386716743e-05, "loss": 0.0445, "num_input_tokens_seen": 27445056, "step": 27275 }, { "epoch": 12.861857614332862, "grad_norm": 0.9413321614265442, "learning_rate": 1.7020285235852824e-05, "loss": 0.0505, "num_input_tokens_seen": 27450176, "step": 27280 }, { "epoch": 12.864214992927865, "grad_norm": 0.057050615549087524, "learning_rate": 1.7010537955368438e-05, "loss": 0.0705, "num_input_tokens_seen": 27455264, "step": 27285 }, { "epoch": 12.866572371522867, "grad_norm": 0.2628794014453888, "learning_rate": 1.7000792027364327e-05, "loss": 0.1085, "num_input_tokens_seen": 27459104, "step": 27290 }, { "epoch": 12.86892975011787, "grad_norm": 1.167077898979187, "learning_rate": 1.6991047453490312e-05, "loss": 0.1372, "num_input_tokens_seen": 27463936, "step": 27295 }, { "epoch": 12.871287128712872, "grad_norm": 0.15775015950202942, "learning_rate": 1.698130423539597e-05, "loss": 0.0875, "num_input_tokens_seen": 27468320, "step": 27300 }, { "epoch": 12.873644507307874, "grad_norm": 0.19752182066440582, "learning_rate": 1.697156237473067e-05, "loss": 0.1172, "num_input_tokens_seen": 27472544, "step": 27305 }, { "epoch": 12.876001885902877, "grad_norm": 0.2524728775024414, "learning_rate": 1.6961821873143553e-05, "loss": 0.0395, "num_input_tokens_seen": 27477408, "step": 27310 }, { "epoch": 12.878359264497877, "grad_norm": 0.9352668523788452, "learning_rate": 1.6952082732283507e-05, "loss": 0.2438, "num_input_tokens_seen": 27482240, "step": 27315 }, { "epoch": 12.88071664309288, "grad_norm": 1.0456651449203491, "learning_rate": 1.694234495379921e-05, "loss": 0.3395, "num_input_tokens_seen": 27486752, "step": 27320 }, { "epoch": 12.883074021687882, "grad_norm": 0.4823342561721802, "learning_rate": 1.69326085393391e-05, "loss": 0.0494, "num_input_tokens_seen": 27491584, "step": 27325 }, { "epoch": 12.885431400282885, "grad_norm": 2.5104942321777344, "learning_rate": 1.692287349055139e-05, "loss": 0.2484, "num_input_tokens_seen": 27496608, "step": 27330 }, { "epoch": 12.887788778877887, "grad_norm": 0.4930325448513031, "learning_rate": 1.6913139809084054e-05, "loss": 0.1434, "num_input_tokens_seen": 27501632, "step": 27335 }, { "epoch": 12.89014615747289, "grad_norm": 0.29810062050819397, "learning_rate": 1.6903407496584843e-05, "loss": 0.0232, "num_input_tokens_seen": 27506784, "step": 27340 }, { "epoch": 12.892503536067892, "grad_norm": 0.13758181035518646, "learning_rate": 1.6893676554701272e-05, "loss": 0.0855, "num_input_tokens_seen": 27511744, "step": 27345 }, { "epoch": 12.894860914662894, "grad_norm": 1.4611835479736328, "learning_rate": 1.6883946985080617e-05, "loss": 0.0899, "num_input_tokens_seen": 27517952, "step": 27350 }, { "epoch": 12.897218293257897, "grad_norm": 0.06107639893889427, "learning_rate": 1.687421878936994e-05, "loss": 0.0524, "num_input_tokens_seen": 27522432, "step": 27355 }, { "epoch": 12.899575671852899, "grad_norm": 0.6477144360542297, "learning_rate": 1.6864491969216054e-05, "loss": 0.1111, "num_input_tokens_seen": 27526688, "step": 27360 }, { "epoch": 12.901933050447902, "grad_norm": 1.0745859146118164, "learning_rate": 1.685476652626554e-05, "loss": 0.1915, "num_input_tokens_seen": 27531424, "step": 27365 }, { "epoch": 12.904290429042904, "grad_norm": 0.4775439500808716, "learning_rate": 1.6845042462164758e-05, "loss": 0.1885, "num_input_tokens_seen": 27536928, "step": 27370 }, { "epoch": 12.906647807637906, "grad_norm": 1.3874742984771729, "learning_rate": 1.6835319778559827e-05, "loss": 0.2026, "num_input_tokens_seen": 27542592, "step": 27375 }, { "epoch": 12.909005186232909, "grad_norm": 0.04082968831062317, "learning_rate": 1.6825598477096625e-05, "loss": 0.0257, "num_input_tokens_seen": 27547712, "step": 27380 }, { "epoch": 12.911362564827911, "grad_norm": 1.587715744972229, "learning_rate": 1.6815878559420805e-05, "loss": 0.0958, "num_input_tokens_seen": 27552128, "step": 27385 }, { "epoch": 12.913719943422914, "grad_norm": 0.07970046252012253, "learning_rate": 1.680616002717779e-05, "loss": 0.0346, "num_input_tokens_seen": 27557184, "step": 27390 }, { "epoch": 12.916077322017916, "grad_norm": 0.5230069160461426, "learning_rate": 1.679644288201276e-05, "loss": 0.0392, "num_input_tokens_seen": 27562464, "step": 27395 }, { "epoch": 12.918434700612918, "grad_norm": 1.0042794942855835, "learning_rate": 1.6786727125570663e-05, "loss": 0.209, "num_input_tokens_seen": 27567680, "step": 27400 }, { "epoch": 12.92079207920792, "grad_norm": 0.7284773588180542, "learning_rate": 1.6777012759496204e-05, "loss": 0.0739, "num_input_tokens_seen": 27573792, "step": 27405 }, { "epoch": 12.923149457802923, "grad_norm": 1.7722752094268799, "learning_rate": 1.6767299785433877e-05, "loss": 0.1299, "num_input_tokens_seen": 27580512, "step": 27410 }, { "epoch": 12.925506836397926, "grad_norm": 0.026179324835538864, "learning_rate": 1.6757588205027907e-05, "loss": 0.0668, "num_input_tokens_seen": 27585504, "step": 27415 }, { "epoch": 12.927864214992928, "grad_norm": 0.48940590023994446, "learning_rate": 1.674787801992231e-05, "loss": 0.0589, "num_input_tokens_seen": 27590464, "step": 27420 }, { "epoch": 12.93022159358793, "grad_norm": 0.059122517704963684, "learning_rate": 1.6738169231760842e-05, "loss": 0.0093, "num_input_tokens_seen": 27596000, "step": 27425 }, { "epoch": 12.932578972182933, "grad_norm": 1.6558887958526611, "learning_rate": 1.672846184218705e-05, "loss": 0.0926, "num_input_tokens_seen": 27601280, "step": 27430 }, { "epoch": 12.934936350777935, "grad_norm": 0.13447244465351105, "learning_rate": 1.671875585284422e-05, "loss": 0.1627, "num_input_tokens_seen": 27606144, "step": 27435 }, { "epoch": 12.937293729372938, "grad_norm": 0.08221117407083511, "learning_rate": 1.6709051265375424e-05, "loss": 0.2003, "num_input_tokens_seen": 27610592, "step": 27440 }, { "epoch": 12.93965110796794, "grad_norm": 0.4325973391532898, "learning_rate": 1.669934808142347e-05, "loss": 0.1029, "num_input_tokens_seen": 27615328, "step": 27445 }, { "epoch": 12.942008486562942, "grad_norm": 0.1195387989282608, "learning_rate": 1.668964630263095e-05, "loss": 0.0537, "num_input_tokens_seen": 27620032, "step": 27450 }, { "epoch": 12.944365865157945, "grad_norm": 1.0860337018966675, "learning_rate": 1.667994593064021e-05, "loss": 0.1227, "num_input_tokens_seen": 27625504, "step": 27455 }, { "epoch": 12.946723243752947, "grad_norm": 0.026383979246020317, "learning_rate": 1.6670246967093367e-05, "loss": 0.1924, "num_input_tokens_seen": 27629440, "step": 27460 }, { "epoch": 12.94908062234795, "grad_norm": 0.8595477342605591, "learning_rate": 1.666054941363227e-05, "loss": 0.118, "num_input_tokens_seen": 27634528, "step": 27465 }, { "epoch": 12.951438000942952, "grad_norm": 0.504947304725647, "learning_rate": 1.6650853271898564e-05, "loss": 0.0401, "num_input_tokens_seen": 27639328, "step": 27470 }, { "epoch": 12.953795379537954, "grad_norm": 1.402589201927185, "learning_rate": 1.6641158543533637e-05, "loss": 0.2022, "num_input_tokens_seen": 27644224, "step": 27475 }, { "epoch": 12.956152758132957, "grad_norm": 0.9974941611289978, "learning_rate": 1.663146523017865e-05, "loss": 0.0758, "num_input_tokens_seen": 27649376, "step": 27480 }, { "epoch": 12.95851013672796, "grad_norm": 2.4958386421203613, "learning_rate": 1.6621773333474507e-05, "loss": 0.1526, "num_input_tokens_seen": 27654720, "step": 27485 }, { "epoch": 12.960867515322962, "grad_norm": 0.6049426794052124, "learning_rate": 1.6612082855061878e-05, "loss": 0.1885, "num_input_tokens_seen": 27659520, "step": 27490 }, { "epoch": 12.963224893917964, "grad_norm": 1.1593130826950073, "learning_rate": 1.6602393796581205e-05, "loss": 0.1171, "num_input_tokens_seen": 27664128, "step": 27495 }, { "epoch": 12.965582272512966, "grad_norm": 0.21252083778381348, "learning_rate": 1.6592706159672688e-05, "loss": 0.2103, "num_input_tokens_seen": 27669408, "step": 27500 }, { "epoch": 12.967939651107969, "grad_norm": 0.0980028361082077, "learning_rate": 1.6583019945976264e-05, "loss": 0.0321, "num_input_tokens_seen": 27673600, "step": 27505 }, { "epoch": 12.97029702970297, "grad_norm": 0.6969453692436218, "learning_rate": 1.6573335157131647e-05, "loss": 0.047, "num_input_tokens_seen": 27678080, "step": 27510 }, { "epoch": 12.972654408297974, "grad_norm": 0.13860012590885162, "learning_rate": 1.6563651794778317e-05, "loss": 0.1832, "num_input_tokens_seen": 27683200, "step": 27515 }, { "epoch": 12.975011786892974, "grad_norm": 0.8803449869155884, "learning_rate": 1.65539698605555e-05, "loss": 0.0805, "num_input_tokens_seen": 27688256, "step": 27520 }, { "epoch": 12.977369165487977, "grad_norm": 0.3381086587905884, "learning_rate": 1.6544289356102172e-05, "loss": 0.0862, "num_input_tokens_seen": 27693984, "step": 27525 }, { "epoch": 12.979726544082979, "grad_norm": 0.12190450727939606, "learning_rate": 1.653461028305709e-05, "loss": 0.2051, "num_input_tokens_seen": 27698656, "step": 27530 }, { "epoch": 12.982083922677981, "grad_norm": 0.12219392508268356, "learning_rate": 1.6524932643058753e-05, "loss": 0.1017, "num_input_tokens_seen": 27702880, "step": 27535 }, { "epoch": 12.984441301272984, "grad_norm": 0.03613370656967163, "learning_rate": 1.6515256437745425e-05, "loss": 0.0374, "num_input_tokens_seen": 27707616, "step": 27540 }, { "epoch": 12.986798679867986, "grad_norm": 2.048656940460205, "learning_rate": 1.6505581668755112e-05, "loss": 0.1796, "num_input_tokens_seen": 27712288, "step": 27545 }, { "epoch": 12.989156058462989, "grad_norm": 1.4310600757598877, "learning_rate": 1.64959083377256e-05, "loss": 0.1706, "num_input_tokens_seen": 27716928, "step": 27550 }, { "epoch": 12.991513437057991, "grad_norm": 0.7848905324935913, "learning_rate": 1.6486236446294413e-05, "loss": 0.0309, "num_input_tokens_seen": 27721152, "step": 27555 }, { "epoch": 12.993870815652993, "grad_norm": 0.4792338013648987, "learning_rate": 1.6476565996098845e-05, "loss": 0.041, "num_input_tokens_seen": 27727168, "step": 27560 }, { "epoch": 12.996228194247996, "grad_norm": 0.05067487433552742, "learning_rate": 1.6466896988775927e-05, "loss": 0.0089, "num_input_tokens_seen": 27731968, "step": 27565 }, { "epoch": 12.998585572842998, "grad_norm": 0.8003635406494141, "learning_rate": 1.6457229425962466e-05, "loss": 0.0795, "num_input_tokens_seen": 27736064, "step": 27570 }, { "epoch": 13.0, "eval_loss": 0.1524803191423416, "eval_runtime": 15.0482, "eval_samples_per_second": 62.665, "eval_steps_per_second": 15.683, "num_input_tokens_seen": 27739072, "step": 27573 }, { "epoch": 13.000942951438, "grad_norm": 0.8757936358451843, "learning_rate": 1.6447563309295016e-05, "loss": 0.1305, "num_input_tokens_seen": 27741632, "step": 27575 }, { "epoch": 13.003300330033003, "grad_norm": 0.04936086758971214, "learning_rate": 1.6437898640409894e-05, "loss": 0.0888, "num_input_tokens_seen": 27747392, "step": 27580 }, { "epoch": 13.005657708628005, "grad_norm": 0.03834601491689682, "learning_rate": 1.6428235420943146e-05, "loss": 0.1499, "num_input_tokens_seen": 27751808, "step": 27585 }, { "epoch": 13.008015087223008, "grad_norm": 1.2606784105300903, "learning_rate": 1.6418573652530606e-05, "loss": 0.0521, "num_input_tokens_seen": 27756448, "step": 27590 }, { "epoch": 13.01037246581801, "grad_norm": 0.026112930849194527, "learning_rate": 1.6408913336807835e-05, "loss": 0.0961, "num_input_tokens_seen": 27761216, "step": 27595 }, { "epoch": 13.012729844413013, "grad_norm": 0.0958615243434906, "learning_rate": 1.639925447541018e-05, "loss": 0.1868, "num_input_tokens_seen": 27765120, "step": 27600 }, { "epoch": 13.015087223008015, "grad_norm": 0.2082289606332779, "learning_rate": 1.6389597069972702e-05, "loss": 0.0868, "num_input_tokens_seen": 27770336, "step": 27605 }, { "epoch": 13.017444601603017, "grad_norm": 1.2170392274856567, "learning_rate": 1.6379941122130243e-05, "loss": 0.1219, "num_input_tokens_seen": 27775104, "step": 27610 }, { "epoch": 13.01980198019802, "grad_norm": 0.030542684718966484, "learning_rate": 1.637028663351739e-05, "loss": 0.0498, "num_input_tokens_seen": 27779872, "step": 27615 }, { "epoch": 13.022159358793022, "grad_norm": 0.14337970316410065, "learning_rate": 1.6360633605768495e-05, "loss": 0.0262, "num_input_tokens_seen": 27783584, "step": 27620 }, { "epoch": 13.024516737388025, "grad_norm": 0.3973034918308258, "learning_rate": 1.635098204051764e-05, "loss": 0.0574, "num_input_tokens_seen": 27787712, "step": 27625 }, { "epoch": 13.026874115983027, "grad_norm": 1.4248343706130981, "learning_rate": 1.634133193939866e-05, "loss": 0.197, "num_input_tokens_seen": 27792352, "step": 27630 }, { "epoch": 13.02923149457803, "grad_norm": 0.605689525604248, "learning_rate": 1.6331683304045192e-05, "loss": 0.128, "num_input_tokens_seen": 27797472, "step": 27635 }, { "epoch": 13.031588873173032, "grad_norm": 2.0109705924987793, "learning_rate": 1.6322036136090547e-05, "loss": 0.1246, "num_input_tokens_seen": 27802848, "step": 27640 }, { "epoch": 13.033946251768034, "grad_norm": 0.8816770315170288, "learning_rate": 1.6312390437167835e-05, "loss": 0.1853, "num_input_tokens_seen": 27807968, "step": 27645 }, { "epoch": 13.036303630363037, "grad_norm": 0.8304774761199951, "learning_rate": 1.6302746208909924e-05, "loss": 0.201, "num_input_tokens_seen": 27813088, "step": 27650 }, { "epoch": 13.038661008958039, "grad_norm": 0.40817227959632874, "learning_rate": 1.6293103452949405e-05, "loss": 0.0723, "num_input_tokens_seen": 27818272, "step": 27655 }, { "epoch": 13.041018387553041, "grad_norm": 0.18676789104938507, "learning_rate": 1.6283462170918635e-05, "loss": 0.0823, "num_input_tokens_seen": 27822528, "step": 27660 }, { "epoch": 13.043375766148044, "grad_norm": 0.8504906296730042, "learning_rate": 1.627382236444972e-05, "loss": 0.0576, "num_input_tokens_seen": 27828704, "step": 27665 }, { "epoch": 13.045733144743046, "grad_norm": 0.6242481470108032, "learning_rate": 1.6264184035174523e-05, "loss": 0.0976, "num_input_tokens_seen": 27833920, "step": 27670 }, { "epoch": 13.048090523338049, "grad_norm": 0.4479981064796448, "learning_rate": 1.625454718472464e-05, "loss": 0.0719, "num_input_tokens_seen": 27838624, "step": 27675 }, { "epoch": 13.050447901933051, "grad_norm": 0.06726183742284775, "learning_rate": 1.624491181473142e-05, "loss": 0.1779, "num_input_tokens_seen": 27844864, "step": 27680 }, { "epoch": 13.052805280528053, "grad_norm": 0.10321830958127975, "learning_rate": 1.6235277926825986e-05, "loss": 0.0489, "num_input_tokens_seen": 27849312, "step": 27685 }, { "epoch": 13.055162659123056, "grad_norm": 0.43402808904647827, "learning_rate": 1.622564552263919e-05, "loss": 0.0836, "num_input_tokens_seen": 27854016, "step": 27690 }, { "epoch": 13.057520037718058, "grad_norm": 0.10345843434333801, "learning_rate": 1.6216014603801616e-05, "loss": 0.0712, "num_input_tokens_seen": 27858880, "step": 27695 }, { "epoch": 13.05987741631306, "grad_norm": 2.64408016204834, "learning_rate": 1.6206385171943626e-05, "loss": 0.1705, "num_input_tokens_seen": 27864320, "step": 27700 }, { "epoch": 13.062234794908063, "grad_norm": 1.224687933921814, "learning_rate": 1.619675722869532e-05, "loss": 0.1502, "num_input_tokens_seen": 27870144, "step": 27705 }, { "epoch": 13.064592173503065, "grad_norm": 1.6598618030548096, "learning_rate": 1.6187130775686554e-05, "loss": 0.0798, "num_input_tokens_seen": 27874624, "step": 27710 }, { "epoch": 13.066949552098066, "grad_norm": 0.5231867432594299, "learning_rate": 1.6177505814546902e-05, "loss": 0.05, "num_input_tokens_seen": 27879840, "step": 27715 }, { "epoch": 13.069306930693068, "grad_norm": 0.036052584648132324, "learning_rate": 1.616788234690572e-05, "loss": 0.2073, "num_input_tokens_seen": 27884576, "step": 27720 }, { "epoch": 13.07166430928807, "grad_norm": 0.0999307706952095, "learning_rate": 1.6158260374392098e-05, "loss": 0.0986, "num_input_tokens_seen": 27888416, "step": 27725 }, { "epoch": 13.074021687883073, "grad_norm": 1.6560033559799194, "learning_rate": 1.6148639898634875e-05, "loss": 0.1681, "num_input_tokens_seen": 27893856, "step": 27730 }, { "epoch": 13.076379066478076, "grad_norm": 0.4318532645702362, "learning_rate": 1.6139020921262626e-05, "loss": 0.0213, "num_input_tokens_seen": 27898464, "step": 27735 }, { "epoch": 13.078736445073078, "grad_norm": 0.8072445392608643, "learning_rate": 1.6129403443903683e-05, "loss": 0.0984, "num_input_tokens_seen": 27904224, "step": 27740 }, { "epoch": 13.08109382366808, "grad_norm": 0.08122153580188751, "learning_rate": 1.611978746818612e-05, "loss": 0.0889, "num_input_tokens_seen": 27909120, "step": 27745 }, { "epoch": 13.083451202263083, "grad_norm": 0.8682464957237244, "learning_rate": 1.611017299573777e-05, "loss": 0.042, "num_input_tokens_seen": 27914368, "step": 27750 }, { "epoch": 13.085808580858085, "grad_norm": 0.2462792694568634, "learning_rate": 1.6100560028186184e-05, "loss": 0.04, "num_input_tokens_seen": 27918784, "step": 27755 }, { "epoch": 13.088165959453088, "grad_norm": 2.5813751220703125, "learning_rate": 1.6090948567158682e-05, "loss": 0.2318, "num_input_tokens_seen": 27923904, "step": 27760 }, { "epoch": 13.09052333804809, "grad_norm": 0.3317680358886719, "learning_rate": 1.6081338614282322e-05, "loss": 0.0295, "num_input_tokens_seen": 27928352, "step": 27765 }, { "epoch": 13.092880716643092, "grad_norm": 0.19630296528339386, "learning_rate": 1.6071730171183908e-05, "loss": 0.0292, "num_input_tokens_seen": 27932960, "step": 27770 }, { "epoch": 13.095238095238095, "grad_norm": 1.307132363319397, "learning_rate": 1.6062123239489973e-05, "loss": 0.2908, "num_input_tokens_seen": 27937568, "step": 27775 }, { "epoch": 13.097595473833097, "grad_norm": 0.0966794341802597, "learning_rate": 1.6052517820826824e-05, "loss": 0.136, "num_input_tokens_seen": 27942944, "step": 27780 }, { "epoch": 13.0999528524281, "grad_norm": 0.05335894972085953, "learning_rate": 1.6042913916820484e-05, "loss": 0.0244, "num_input_tokens_seen": 27947552, "step": 27785 }, { "epoch": 13.102310231023102, "grad_norm": 1.238787293434143, "learning_rate": 1.6033311529096743e-05, "loss": 0.1135, "num_input_tokens_seen": 27952448, "step": 27790 }, { "epoch": 13.104667609618105, "grad_norm": 0.10234064608812332, "learning_rate": 1.602371065928111e-05, "loss": 0.0655, "num_input_tokens_seen": 27958080, "step": 27795 }, { "epoch": 13.107024988213107, "grad_norm": 1.0315245389938354, "learning_rate": 1.601411130899885e-05, "loss": 0.1675, "num_input_tokens_seen": 27962912, "step": 27800 }, { "epoch": 13.10938236680811, "grad_norm": 0.11634054034948349, "learning_rate": 1.6004513479874974e-05, "loss": 0.0574, "num_input_tokens_seen": 27968544, "step": 27805 }, { "epoch": 13.111739745403112, "grad_norm": 0.6141372323036194, "learning_rate": 1.599491717353424e-05, "loss": 0.0793, "num_input_tokens_seen": 27974816, "step": 27810 }, { "epoch": 13.114097123998114, "grad_norm": 0.44787049293518066, "learning_rate": 1.5985322391601124e-05, "loss": 0.0814, "num_input_tokens_seen": 27979104, "step": 27815 }, { "epoch": 13.116454502593117, "grad_norm": 0.12848111987113953, "learning_rate": 1.5975729135699865e-05, "loss": 0.0488, "num_input_tokens_seen": 27983136, "step": 27820 }, { "epoch": 13.118811881188119, "grad_norm": 0.09240169078111649, "learning_rate": 1.596613740745444e-05, "loss": 0.0378, "num_input_tokens_seen": 27988640, "step": 27825 }, { "epoch": 13.121169259783121, "grad_norm": 0.2759908139705658, "learning_rate": 1.5956547208488572e-05, "loss": 0.0393, "num_input_tokens_seen": 27993984, "step": 27830 }, { "epoch": 13.123526638378124, "grad_norm": 0.9823769330978394, "learning_rate": 1.594695854042571e-05, "loss": 0.1196, "num_input_tokens_seen": 27998624, "step": 27835 }, { "epoch": 13.125884016973126, "grad_norm": 0.19158200919628143, "learning_rate": 1.5937371404889052e-05, "loss": 0.1115, "num_input_tokens_seen": 28003168, "step": 27840 }, { "epoch": 13.128241395568129, "grad_norm": 1.6193667650222778, "learning_rate": 1.592778580350154e-05, "loss": 0.0427, "num_input_tokens_seen": 28007584, "step": 27845 }, { "epoch": 13.130598774163131, "grad_norm": 1.414949655532837, "learning_rate": 1.5918201737885863e-05, "loss": 0.1095, "num_input_tokens_seen": 28013216, "step": 27850 }, { "epoch": 13.132956152758133, "grad_norm": 2.0345349311828613, "learning_rate": 1.5908619209664423e-05, "loss": 0.254, "num_input_tokens_seen": 28017824, "step": 27855 }, { "epoch": 13.135313531353136, "grad_norm": 0.15975934267044067, "learning_rate": 1.58990382204594e-05, "loss": 0.0356, "num_input_tokens_seen": 28022976, "step": 27860 }, { "epoch": 13.137670909948138, "grad_norm": 1.7711145877838135, "learning_rate": 1.5889458771892667e-05, "loss": 0.1085, "num_input_tokens_seen": 28027552, "step": 27865 }, { "epoch": 13.14002828854314, "grad_norm": 1.5504556894302368, "learning_rate": 1.5879880865585878e-05, "loss": 0.153, "num_input_tokens_seen": 28031872, "step": 27870 }, { "epoch": 13.142385667138143, "grad_norm": 0.7581167221069336, "learning_rate": 1.5870304503160407e-05, "loss": 0.0704, "num_input_tokens_seen": 28037152, "step": 27875 }, { "epoch": 13.144743045733145, "grad_norm": 0.6650431752204895, "learning_rate": 1.5860729686237378e-05, "loss": 0.0815, "num_input_tokens_seen": 28042912, "step": 27880 }, { "epoch": 13.147100424328148, "grad_norm": 3.3127222061157227, "learning_rate": 1.585115641643763e-05, "loss": 0.094, "num_input_tokens_seen": 28048928, "step": 27885 }, { "epoch": 13.14945780292315, "grad_norm": 0.33658862113952637, "learning_rate": 1.584158469538176e-05, "loss": 0.0284, "num_input_tokens_seen": 28054112, "step": 27890 }, { "epoch": 13.151815181518153, "grad_norm": 1.1047842502593994, "learning_rate": 1.5832014524690102e-05, "loss": 0.2265, "num_input_tokens_seen": 28059488, "step": 27895 }, { "epoch": 13.154172560113155, "grad_norm": 0.7988759875297546, "learning_rate": 1.582244590598273e-05, "loss": 0.0527, "num_input_tokens_seen": 28065056, "step": 27900 }, { "epoch": 13.156529938708157, "grad_norm": 1.4017081260681152, "learning_rate": 1.581287884087943e-05, "loss": 0.3188, "num_input_tokens_seen": 28071584, "step": 27905 }, { "epoch": 13.15888731730316, "grad_norm": 0.029975472018122673, "learning_rate": 1.5803313330999758e-05, "loss": 0.0499, "num_input_tokens_seen": 28075968, "step": 27910 }, { "epoch": 13.16124469589816, "grad_norm": 0.39266565442085266, "learning_rate": 1.5793749377962986e-05, "loss": 0.052, "num_input_tokens_seen": 28080992, "step": 27915 }, { "epoch": 13.163602074493163, "grad_norm": 0.31110233068466187, "learning_rate": 1.5784186983388134e-05, "loss": 0.03, "num_input_tokens_seen": 28085280, "step": 27920 }, { "epoch": 13.165959453088165, "grad_norm": 0.19048413634300232, "learning_rate": 1.577462614889395e-05, "loss": 0.2594, "num_input_tokens_seen": 28090816, "step": 27925 }, { "epoch": 13.168316831683168, "grad_norm": 0.135964035987854, "learning_rate": 1.5765066876098916e-05, "loss": 0.0496, "num_input_tokens_seen": 28095168, "step": 27930 }, { "epoch": 13.17067421027817, "grad_norm": 0.15617455542087555, "learning_rate": 1.5755509166621264e-05, "loss": 0.0316, "num_input_tokens_seen": 28099776, "step": 27935 }, { "epoch": 13.173031588873172, "grad_norm": 2.24574613571167, "learning_rate": 1.574595302207895e-05, "loss": 0.1029, "num_input_tokens_seen": 28104352, "step": 27940 }, { "epoch": 13.175388967468175, "grad_norm": 2.7517008781433105, "learning_rate": 1.573639844408966e-05, "loss": 0.1616, "num_input_tokens_seen": 28108480, "step": 27945 }, { "epoch": 13.177746346063177, "grad_norm": 0.16674110293388367, "learning_rate": 1.5726845434270827e-05, "loss": 0.2537, "num_input_tokens_seen": 28113152, "step": 27950 }, { "epoch": 13.18010372465818, "grad_norm": 1.2365305423736572, "learning_rate": 1.571729399423961e-05, "loss": 0.0498, "num_input_tokens_seen": 28118720, "step": 27955 }, { "epoch": 13.182461103253182, "grad_norm": 0.045323461294174194, "learning_rate": 1.5707744125612915e-05, "loss": 0.148, "num_input_tokens_seen": 28124512, "step": 27960 }, { "epoch": 13.184818481848184, "grad_norm": 0.1633748710155487, "learning_rate": 1.5698195830007363e-05, "loss": 0.1151, "num_input_tokens_seen": 28130528, "step": 27965 }, { "epoch": 13.187175860443187, "grad_norm": 1.7472983598709106, "learning_rate": 1.568864910903932e-05, "loss": 0.1705, "num_input_tokens_seen": 28135040, "step": 27970 }, { "epoch": 13.18953323903819, "grad_norm": 0.6326723694801331, "learning_rate": 1.5679103964324884e-05, "loss": 0.0648, "num_input_tokens_seen": 28140096, "step": 27975 }, { "epoch": 13.191890617633192, "grad_norm": 3.454939365386963, "learning_rate": 1.5669560397479894e-05, "loss": 0.0949, "num_input_tokens_seen": 28145248, "step": 27980 }, { "epoch": 13.194247996228194, "grad_norm": 1.857779622077942, "learning_rate": 1.56600184101199e-05, "loss": 0.2867, "num_input_tokens_seen": 28149696, "step": 27985 }, { "epoch": 13.196605374823196, "grad_norm": 0.2507436275482178, "learning_rate": 1.565047800386021e-05, "loss": 0.2572, "num_input_tokens_seen": 28155488, "step": 27990 }, { "epoch": 13.198962753418199, "grad_norm": 0.5408629179000854, "learning_rate": 1.5640939180315843e-05, "loss": 0.0817, "num_input_tokens_seen": 28160000, "step": 27995 }, { "epoch": 13.201320132013201, "grad_norm": 0.3583332896232605, "learning_rate": 1.563140194110157e-05, "loss": 0.0188, "num_input_tokens_seen": 28165920, "step": 28000 }, { "epoch": 13.203677510608204, "grad_norm": 1.1720740795135498, "learning_rate": 1.5621866287831872e-05, "loss": 0.1451, "num_input_tokens_seen": 28170880, "step": 28005 }, { "epoch": 13.206034889203206, "grad_norm": 0.5196394920349121, "learning_rate": 1.5612332222120984e-05, "loss": 0.1746, "num_input_tokens_seen": 28176416, "step": 28010 }, { "epoch": 13.208392267798208, "grad_norm": 1.348212480545044, "learning_rate": 1.560279974558286e-05, "loss": 0.0852, "num_input_tokens_seen": 28181152, "step": 28015 }, { "epoch": 13.21074964639321, "grad_norm": 0.11077004671096802, "learning_rate": 1.5593268859831182e-05, "loss": 0.0835, "num_input_tokens_seen": 28185344, "step": 28020 }, { "epoch": 13.213107024988213, "grad_norm": 1.4189516305923462, "learning_rate": 1.5583739566479367e-05, "loss": 0.0802, "num_input_tokens_seen": 28189568, "step": 28025 }, { "epoch": 13.215464403583216, "grad_norm": 0.23789478838443756, "learning_rate": 1.5574211867140565e-05, "loss": 0.2036, "num_input_tokens_seen": 28193824, "step": 28030 }, { "epoch": 13.217821782178218, "grad_norm": 0.951287567615509, "learning_rate": 1.556468576342765e-05, "loss": 0.07, "num_input_tokens_seen": 28198336, "step": 28035 }, { "epoch": 13.22017916077322, "grad_norm": 0.12547621130943298, "learning_rate": 1.555516125695324e-05, "loss": 0.0432, "num_input_tokens_seen": 28203776, "step": 28040 }, { "epoch": 13.222536539368223, "grad_norm": 0.8375582695007324, "learning_rate": 1.5545638349329662e-05, "loss": 0.0665, "num_input_tokens_seen": 28210560, "step": 28045 }, { "epoch": 13.224893917963225, "grad_norm": 0.5425445437431335, "learning_rate": 1.553611704216898e-05, "loss": 0.1275, "num_input_tokens_seen": 28215648, "step": 28050 }, { "epoch": 13.227251296558228, "grad_norm": 0.3554697632789612, "learning_rate": 1.5526597337083e-05, "loss": 0.1729, "num_input_tokens_seen": 28220992, "step": 28055 }, { "epoch": 13.22960867515323, "grad_norm": 2.6447815895080566, "learning_rate": 1.5517079235683245e-05, "loss": 0.4442, "num_input_tokens_seen": 28225376, "step": 28060 }, { "epoch": 13.231966053748232, "grad_norm": 0.04947246238589287, "learning_rate": 1.550756273958096e-05, "loss": 0.0737, "num_input_tokens_seen": 28230144, "step": 28065 }, { "epoch": 13.234323432343235, "grad_norm": 0.5311762094497681, "learning_rate": 1.549804785038713e-05, "loss": 0.0715, "num_input_tokens_seen": 28234240, "step": 28070 }, { "epoch": 13.236680810938237, "grad_norm": 1.1980133056640625, "learning_rate": 1.5488534569712465e-05, "loss": 0.163, "num_input_tokens_seen": 28241248, "step": 28075 }, { "epoch": 13.23903818953324, "grad_norm": 0.19158414006233215, "learning_rate": 1.5479022899167407e-05, "loss": 0.1323, "num_input_tokens_seen": 28245504, "step": 28080 }, { "epoch": 13.241395568128242, "grad_norm": 0.3121155798435211, "learning_rate": 1.5469512840362117e-05, "loss": 0.0609, "num_input_tokens_seen": 28250272, "step": 28085 }, { "epoch": 13.243752946723244, "grad_norm": 0.15696083009243011, "learning_rate": 1.5460004394906484e-05, "loss": 0.0789, "num_input_tokens_seen": 28255424, "step": 28090 }, { "epoch": 13.246110325318247, "grad_norm": 1.797697901725769, "learning_rate": 1.545049756441012e-05, "loss": 0.073, "num_input_tokens_seen": 28261728, "step": 28095 }, { "epoch": 13.24846770391325, "grad_norm": 1.0213871002197266, "learning_rate": 1.5440992350482377e-05, "loss": 0.0908, "num_input_tokens_seen": 28266464, "step": 28100 }, { "epoch": 13.250825082508252, "grad_norm": 0.7882790565490723, "learning_rate": 1.543148875473233e-05, "loss": 0.1528, "num_input_tokens_seen": 28271968, "step": 28105 }, { "epoch": 13.253182461103254, "grad_norm": 0.3001231253147125, "learning_rate": 1.5421986778768774e-05, "loss": 0.1119, "num_input_tokens_seen": 28276224, "step": 28110 }, { "epoch": 13.255539839698255, "grad_norm": 0.06636443734169006, "learning_rate": 1.541248642420023e-05, "loss": 0.2031, "num_input_tokens_seen": 28281088, "step": 28115 }, { "epoch": 13.257897218293257, "grad_norm": 0.2554512619972229, "learning_rate": 1.540298769263494e-05, "loss": 0.2592, "num_input_tokens_seen": 28288000, "step": 28120 }, { "epoch": 13.26025459688826, "grad_norm": 1.389951467514038, "learning_rate": 1.539349058568089e-05, "loss": 0.109, "num_input_tokens_seen": 28293472, "step": 28125 }, { "epoch": 13.262611975483262, "grad_norm": 0.7691171169281006, "learning_rate": 1.538399510494578e-05, "loss": 0.1026, "num_input_tokens_seen": 28298976, "step": 28130 }, { "epoch": 13.264969354078264, "grad_norm": 0.028217362239956856, "learning_rate": 1.5374501252037028e-05, "loss": 0.3376, "num_input_tokens_seen": 28303840, "step": 28135 }, { "epoch": 13.267326732673267, "grad_norm": 0.7293241024017334, "learning_rate": 1.536500902856178e-05, "loss": 0.0827, "num_input_tokens_seen": 28308864, "step": 28140 }, { "epoch": 13.269684111268269, "grad_norm": 2.262446165084839, "learning_rate": 1.5355518436126902e-05, "loss": 0.2335, "num_input_tokens_seen": 28313920, "step": 28145 }, { "epoch": 13.272041489863271, "grad_norm": 0.09137971699237823, "learning_rate": 1.5346029476339015e-05, "loss": 0.2831, "num_input_tokens_seen": 28319232, "step": 28150 }, { "epoch": 13.274398868458274, "grad_norm": 1.7467011213302612, "learning_rate": 1.533654215080441e-05, "loss": 0.1717, "num_input_tokens_seen": 28324992, "step": 28155 }, { "epoch": 13.276756247053276, "grad_norm": 2.70531964302063, "learning_rate": 1.5327056461129145e-05, "loss": 0.1388, "num_input_tokens_seen": 28329536, "step": 28160 }, { "epoch": 13.279113625648279, "grad_norm": 0.5511080026626587, "learning_rate": 1.5317572408918983e-05, "loss": 0.0773, "num_input_tokens_seen": 28334208, "step": 28165 }, { "epoch": 13.281471004243281, "grad_norm": 2.20389723777771, "learning_rate": 1.5308089995779424e-05, "loss": 0.0893, "num_input_tokens_seen": 28338784, "step": 28170 }, { "epoch": 13.283828382838283, "grad_norm": 0.04090970754623413, "learning_rate": 1.5298609223315657e-05, "loss": 0.0381, "num_input_tokens_seen": 28344096, "step": 28175 }, { "epoch": 13.286185761433286, "grad_norm": 2.2712910175323486, "learning_rate": 1.5289130093132632e-05, "loss": 0.1517, "num_input_tokens_seen": 28348992, "step": 28180 }, { "epoch": 13.288543140028288, "grad_norm": 0.08683744817972183, "learning_rate": 1.5279652606835004e-05, "loss": 0.018, "num_input_tokens_seen": 28355936, "step": 28185 }, { "epoch": 13.29090051862329, "grad_norm": 1.6275566816329956, "learning_rate": 1.527017676602715e-05, "loss": 0.1816, "num_input_tokens_seen": 28361120, "step": 28190 }, { "epoch": 13.293257897218293, "grad_norm": 0.21873195469379425, "learning_rate": 1.526070257231316e-05, "loss": 0.2076, "num_input_tokens_seen": 28366272, "step": 28195 }, { "epoch": 13.295615275813295, "grad_norm": 0.6413061618804932, "learning_rate": 1.5251230027296861e-05, "loss": 0.035, "num_input_tokens_seen": 28370368, "step": 28200 }, { "epoch": 13.297972654408298, "grad_norm": 0.0914345532655716, "learning_rate": 1.5241759132581795e-05, "loss": 0.0377, "num_input_tokens_seen": 28375328, "step": 28205 }, { "epoch": 13.3003300330033, "grad_norm": 0.15514051914215088, "learning_rate": 1.523228988977123e-05, "loss": 0.0543, "num_input_tokens_seen": 28380256, "step": 28210 }, { "epoch": 13.302687411598303, "grad_norm": 1.6849020719528198, "learning_rate": 1.5222822300468129e-05, "loss": 0.0604, "num_input_tokens_seen": 28385536, "step": 28215 }, { "epoch": 13.305044790193305, "grad_norm": 1.3481330871582031, "learning_rate": 1.5213356366275209e-05, "loss": 0.0613, "num_input_tokens_seen": 28389632, "step": 28220 }, { "epoch": 13.307402168788308, "grad_norm": 1.1007556915283203, "learning_rate": 1.5203892088794889e-05, "loss": 0.0666, "num_input_tokens_seen": 28395072, "step": 28225 }, { "epoch": 13.30975954738331, "grad_norm": 1.893528699874878, "learning_rate": 1.5194429469629312e-05, "loss": 0.1873, "num_input_tokens_seen": 28399904, "step": 28230 }, { "epoch": 13.312116925978312, "grad_norm": 0.025234280154109, "learning_rate": 1.5184968510380335e-05, "loss": 0.0959, "num_input_tokens_seen": 28403872, "step": 28235 }, { "epoch": 13.314474304573315, "grad_norm": 1.0996570587158203, "learning_rate": 1.5175509212649536e-05, "loss": 0.1056, "num_input_tokens_seen": 28408576, "step": 28240 }, { "epoch": 13.316831683168317, "grad_norm": 0.8922435641288757, "learning_rate": 1.516605157803822e-05, "loss": 0.0408, "num_input_tokens_seen": 28414304, "step": 28245 }, { "epoch": 13.31918906176332, "grad_norm": 0.45069223642349243, "learning_rate": 1.5156595608147406e-05, "loss": 0.2177, "num_input_tokens_seen": 28418592, "step": 28250 }, { "epoch": 13.321546440358322, "grad_norm": 0.13472072780132294, "learning_rate": 1.5147141304577816e-05, "loss": 0.0502, "num_input_tokens_seen": 28423456, "step": 28255 }, { "epoch": 13.323903818953324, "grad_norm": 1.3524590730667114, "learning_rate": 1.5137688668929918e-05, "loss": 0.1562, "num_input_tokens_seen": 28428480, "step": 28260 }, { "epoch": 13.326261197548327, "grad_norm": 1.0881222486495972, "learning_rate": 1.5128237702803868e-05, "loss": 0.1179, "num_input_tokens_seen": 28433120, "step": 28265 }, { "epoch": 13.32861857614333, "grad_norm": 0.27510368824005127, "learning_rate": 1.511878840779957e-05, "loss": 0.0446, "num_input_tokens_seen": 28438560, "step": 28270 }, { "epoch": 13.330975954738332, "grad_norm": 1.8060401678085327, "learning_rate": 1.5109340785516616e-05, "loss": 0.0847, "num_input_tokens_seen": 28444000, "step": 28275 }, { "epoch": 13.333333333333334, "grad_norm": 2.201653003692627, "learning_rate": 1.5099894837554335e-05, "loss": 0.1419, "num_input_tokens_seen": 28449088, "step": 28280 }, { "epoch": 13.335690711928336, "grad_norm": 0.28905361890792847, "learning_rate": 1.509045056551176e-05, "loss": 0.0453, "num_input_tokens_seen": 28454688, "step": 28285 }, { "epoch": 13.338048090523339, "grad_norm": 0.05568305030465126, "learning_rate": 1.5081007970987657e-05, "loss": 0.1226, "num_input_tokens_seen": 28460224, "step": 28290 }, { "epoch": 13.340405469118341, "grad_norm": 0.37700074911117554, "learning_rate": 1.5071567055580482e-05, "loss": 0.0572, "num_input_tokens_seen": 28464704, "step": 28295 }, { "epoch": 13.342762847713344, "grad_norm": 0.2037971317768097, "learning_rate": 1.5062127820888433e-05, "loss": 0.0805, "num_input_tokens_seen": 28469952, "step": 28300 }, { "epoch": 13.345120226308346, "grad_norm": 0.251396507024765, "learning_rate": 1.5052690268509407e-05, "loss": 0.0145, "num_input_tokens_seen": 28477312, "step": 28305 }, { "epoch": 13.347477604903348, "grad_norm": 0.06116997078061104, "learning_rate": 1.5043254400041035e-05, "loss": 0.1747, "num_input_tokens_seen": 28482336, "step": 28310 }, { "epoch": 13.34983498349835, "grad_norm": 0.1720583438873291, "learning_rate": 1.5033820217080629e-05, "loss": 0.0975, "num_input_tokens_seen": 28487456, "step": 28315 }, { "epoch": 13.352192362093351, "grad_norm": 0.1673116236925125, "learning_rate": 1.5024387721225242e-05, "loss": 0.1811, "num_input_tokens_seen": 28491520, "step": 28320 }, { "epoch": 13.354549740688354, "grad_norm": 0.5959230661392212, "learning_rate": 1.5014956914071635e-05, "loss": 0.0377, "num_input_tokens_seen": 28495808, "step": 28325 }, { "epoch": 13.356907119283356, "grad_norm": 1.9362736940383911, "learning_rate": 1.5005527797216284e-05, "loss": 0.2769, "num_input_tokens_seen": 28500768, "step": 28330 }, { "epoch": 13.359264497878359, "grad_norm": 0.16470037400722504, "learning_rate": 1.499610037225539e-05, "loss": 0.0207, "num_input_tokens_seen": 28505376, "step": 28335 }, { "epoch": 13.361621876473361, "grad_norm": 1.8950921297073364, "learning_rate": 1.498667464078484e-05, "loss": 0.176, "num_input_tokens_seen": 28511296, "step": 28340 }, { "epoch": 13.363979255068363, "grad_norm": 0.40865302085876465, "learning_rate": 1.4977250604400256e-05, "loss": 0.0984, "num_input_tokens_seen": 28516224, "step": 28345 }, { "epoch": 13.366336633663366, "grad_norm": 1.0940675735473633, "learning_rate": 1.496782826469697e-05, "loss": 0.1396, "num_input_tokens_seen": 28522848, "step": 28350 }, { "epoch": 13.368694012258368, "grad_norm": 0.25789979100227356, "learning_rate": 1.4958407623270027e-05, "loss": 0.0414, "num_input_tokens_seen": 28527168, "step": 28355 }, { "epoch": 13.37105139085337, "grad_norm": 0.2658991813659668, "learning_rate": 1.4948988681714168e-05, "loss": 0.0984, "num_input_tokens_seen": 28531968, "step": 28360 }, { "epoch": 13.373408769448373, "grad_norm": 0.4723286032676697, "learning_rate": 1.4939571441623874e-05, "loss": 0.0598, "num_input_tokens_seen": 28537216, "step": 28365 }, { "epoch": 13.375766148043375, "grad_norm": 0.260396271944046, "learning_rate": 1.4930155904593312e-05, "loss": 0.1123, "num_input_tokens_seen": 28542464, "step": 28370 }, { "epoch": 13.378123526638378, "grad_norm": 0.7057055234909058, "learning_rate": 1.492074207221639e-05, "loss": 0.0346, "num_input_tokens_seen": 28548064, "step": 28375 }, { "epoch": 13.38048090523338, "grad_norm": 1.3708213567733765, "learning_rate": 1.4911329946086691e-05, "loss": 0.0537, "num_input_tokens_seen": 28552192, "step": 28380 }, { "epoch": 13.382838283828383, "grad_norm": 0.5742071866989136, "learning_rate": 1.4901919527797536e-05, "loss": 0.255, "num_input_tokens_seen": 28557504, "step": 28385 }, { "epoch": 13.385195662423385, "grad_norm": 0.05129808560013771, "learning_rate": 1.489251081894195e-05, "loss": 0.116, "num_input_tokens_seen": 28562528, "step": 28390 }, { "epoch": 13.387553041018387, "grad_norm": 1.315295696258545, "learning_rate": 1.4883103821112677e-05, "loss": 0.1488, "num_input_tokens_seen": 28567040, "step": 28395 }, { "epoch": 13.38991041961339, "grad_norm": 2.0776920318603516, "learning_rate": 1.4873698535902142e-05, "loss": 0.1568, "num_input_tokens_seen": 28572192, "step": 28400 }, { "epoch": 13.392267798208392, "grad_norm": 3.508877754211426, "learning_rate": 1.486429496490251e-05, "loss": 0.2838, "num_input_tokens_seen": 28577312, "step": 28405 }, { "epoch": 13.394625176803395, "grad_norm": 0.10737941414117813, "learning_rate": 1.4854893109705647e-05, "loss": 0.0359, "num_input_tokens_seen": 28583040, "step": 28410 }, { "epoch": 13.396982555398397, "grad_norm": 0.4988805949687958, "learning_rate": 1.484549297190313e-05, "loss": 0.1137, "num_input_tokens_seen": 28588544, "step": 28415 }, { "epoch": 13.3993399339934, "grad_norm": 0.2417876124382019, "learning_rate": 1.4836094553086235e-05, "loss": 0.1474, "num_input_tokens_seen": 28592736, "step": 28420 }, { "epoch": 13.401697312588402, "grad_norm": 2.8559277057647705, "learning_rate": 1.482669785484596e-05, "loss": 0.3304, "num_input_tokens_seen": 28597632, "step": 28425 }, { "epoch": 13.404054691183404, "grad_norm": 0.9040433764457703, "learning_rate": 1.4817302878773003e-05, "loss": 0.2248, "num_input_tokens_seen": 28602272, "step": 28430 }, { "epoch": 13.406412069778407, "grad_norm": 1.8936864137649536, "learning_rate": 1.4807909626457782e-05, "loss": 0.1216, "num_input_tokens_seen": 28608672, "step": 28435 }, { "epoch": 13.408769448373409, "grad_norm": 1.0155422687530518, "learning_rate": 1.4798518099490405e-05, "loss": 0.175, "num_input_tokens_seen": 28613792, "step": 28440 }, { "epoch": 13.411126826968411, "grad_norm": 0.2698638141155243, "learning_rate": 1.4789128299460703e-05, "loss": 0.07, "num_input_tokens_seen": 28619520, "step": 28445 }, { "epoch": 13.413484205563414, "grad_norm": 0.7720139026641846, "learning_rate": 1.477974022795821e-05, "loss": 0.1027, "num_input_tokens_seen": 28623872, "step": 28450 }, { "epoch": 13.415841584158416, "grad_norm": 1.251067042350769, "learning_rate": 1.4770353886572169e-05, "loss": 0.0505, "num_input_tokens_seen": 28629344, "step": 28455 }, { "epoch": 13.418198962753419, "grad_norm": 2.359661817550659, "learning_rate": 1.4760969276891523e-05, "loss": 0.0585, "num_input_tokens_seen": 28633504, "step": 28460 }, { "epoch": 13.420556341348421, "grad_norm": 0.5949262380599976, "learning_rate": 1.4751586400504927e-05, "loss": 0.0801, "num_input_tokens_seen": 28637888, "step": 28465 }, { "epoch": 13.422913719943423, "grad_norm": 1.443530797958374, "learning_rate": 1.4742205259000747e-05, "loss": 0.0911, "num_input_tokens_seen": 28642592, "step": 28470 }, { "epoch": 13.425271098538426, "grad_norm": 0.42010483145713806, "learning_rate": 1.4732825853967053e-05, "loss": 0.037, "num_input_tokens_seen": 28648224, "step": 28475 }, { "epoch": 13.427628477133428, "grad_norm": 0.36760959029197693, "learning_rate": 1.4723448186991612e-05, "loss": 0.1929, "num_input_tokens_seen": 28653600, "step": 28480 }, { "epoch": 13.42998585572843, "grad_norm": 1.8290908336639404, "learning_rate": 1.4714072259661904e-05, "loss": 0.0655, "num_input_tokens_seen": 28659520, "step": 28485 }, { "epoch": 13.432343234323433, "grad_norm": 2.593447685241699, "learning_rate": 1.4704698073565121e-05, "loss": 0.0756, "num_input_tokens_seen": 28664352, "step": 28490 }, { "epoch": 13.434700612918435, "grad_norm": 0.2338738888502121, "learning_rate": 1.4695325630288151e-05, "loss": 0.0241, "num_input_tokens_seen": 28669184, "step": 28495 }, { "epoch": 13.437057991513438, "grad_norm": 2.0759685039520264, "learning_rate": 1.4685954931417587e-05, "loss": 0.123, "num_input_tokens_seen": 28673952, "step": 28500 }, { "epoch": 13.43941537010844, "grad_norm": 1.0760911703109741, "learning_rate": 1.4676585978539729e-05, "loss": 0.2198, "num_input_tokens_seen": 28679712, "step": 28505 }, { "epoch": 13.441772748703443, "grad_norm": 1.462439775466919, "learning_rate": 1.4667218773240582e-05, "loss": 0.0632, "num_input_tokens_seen": 28684768, "step": 28510 }, { "epoch": 13.444130127298443, "grad_norm": 2.647054433822632, "learning_rate": 1.4657853317105863e-05, "loss": 0.1939, "num_input_tokens_seen": 28689248, "step": 28515 }, { "epoch": 13.446487505893446, "grad_norm": 0.3857646584510803, "learning_rate": 1.4648489611720973e-05, "loss": 0.0707, "num_input_tokens_seen": 28693728, "step": 28520 }, { "epoch": 13.448844884488448, "grad_norm": 0.485397070646286, "learning_rate": 1.4639127658671032e-05, "loss": 0.1567, "num_input_tokens_seen": 28699584, "step": 28525 }, { "epoch": 13.45120226308345, "grad_norm": 0.48708468675613403, "learning_rate": 1.4629767459540861e-05, "loss": 0.1533, "num_input_tokens_seen": 28704160, "step": 28530 }, { "epoch": 13.453559641678453, "grad_norm": 3.55869197845459, "learning_rate": 1.462040901591499e-05, "loss": 0.2409, "num_input_tokens_seen": 28709280, "step": 28535 }, { "epoch": 13.455917020273455, "grad_norm": 0.1276678591966629, "learning_rate": 1.4611052329377628e-05, "loss": 0.295, "num_input_tokens_seen": 28714176, "step": 28540 }, { "epoch": 13.458274398868458, "grad_norm": 0.09194603562355042, "learning_rate": 1.460169740151272e-05, "loss": 0.041, "num_input_tokens_seen": 28718944, "step": 28545 }, { "epoch": 13.46063177746346, "grad_norm": 0.38620761036872864, "learning_rate": 1.4592344233903879e-05, "loss": 0.121, "num_input_tokens_seen": 28723616, "step": 28550 }, { "epoch": 13.462989156058462, "grad_norm": 1.3944249153137207, "learning_rate": 1.4582992828134462e-05, "loss": 0.0682, "num_input_tokens_seen": 28727648, "step": 28555 }, { "epoch": 13.465346534653465, "grad_norm": 0.522464394569397, "learning_rate": 1.4573643185787478e-05, "loss": 0.0406, "num_input_tokens_seen": 28734240, "step": 28560 }, { "epoch": 13.467703913248467, "grad_norm": 0.2569176256656647, "learning_rate": 1.4564295308445669e-05, "loss": 0.1236, "num_input_tokens_seen": 28740832, "step": 28565 }, { "epoch": 13.47006129184347, "grad_norm": 0.4652734398841858, "learning_rate": 1.455494919769148e-05, "loss": 0.0379, "num_input_tokens_seen": 28745824, "step": 28570 }, { "epoch": 13.472418670438472, "grad_norm": 0.22334647178649902, "learning_rate": 1.4545604855107048e-05, "loss": 0.0194, "num_input_tokens_seen": 28751424, "step": 28575 }, { "epoch": 13.474776049033474, "grad_norm": 1.119052529335022, "learning_rate": 1.4536262282274205e-05, "loss": 0.0445, "num_input_tokens_seen": 28755712, "step": 28580 }, { "epoch": 13.477133427628477, "grad_norm": 1.2267765998840332, "learning_rate": 1.4526921480774492e-05, "loss": 0.1278, "num_input_tokens_seen": 28761024, "step": 28585 }, { "epoch": 13.47949080622348, "grad_norm": 0.2204579859972, "learning_rate": 1.4517582452189153e-05, "loss": 0.0736, "num_input_tokens_seen": 28766144, "step": 28590 }, { "epoch": 13.481848184818482, "grad_norm": 0.06357379257678986, "learning_rate": 1.450824519809913e-05, "loss": 0.0151, "num_input_tokens_seen": 28770816, "step": 28595 }, { "epoch": 13.484205563413484, "grad_norm": 0.27747437357902527, "learning_rate": 1.4498909720085047e-05, "loss": 0.14, "num_input_tokens_seen": 28776320, "step": 28600 }, { "epoch": 13.486562942008486, "grad_norm": 3.720069169998169, "learning_rate": 1.448957601972725e-05, "loss": 0.082, "num_input_tokens_seen": 28781792, "step": 28605 }, { "epoch": 13.488920320603489, "grad_norm": 0.7240026593208313, "learning_rate": 1.4480244098605778e-05, "loss": 0.1114, "num_input_tokens_seen": 28787104, "step": 28610 }, { "epoch": 13.491277699198491, "grad_norm": 1.1519442796707153, "learning_rate": 1.4470913958300362e-05, "loss": 0.0635, "num_input_tokens_seen": 28791808, "step": 28615 }, { "epoch": 13.493635077793494, "grad_norm": 0.25984060764312744, "learning_rate": 1.4461585600390437e-05, "loss": 0.0676, "num_input_tokens_seen": 28797344, "step": 28620 }, { "epoch": 13.495992456388496, "grad_norm": 0.19377802312374115, "learning_rate": 1.4452259026455143e-05, "loss": 0.2442, "num_input_tokens_seen": 28801632, "step": 28625 }, { "epoch": 13.498349834983498, "grad_norm": 0.8975381255149841, "learning_rate": 1.4442934238073302e-05, "loss": 0.0356, "num_input_tokens_seen": 28806560, "step": 28630 }, { "epoch": 13.500707213578501, "grad_norm": 2.211790084838867, "learning_rate": 1.4433611236823458e-05, "loss": 0.2732, "num_input_tokens_seen": 28811808, "step": 28635 }, { "epoch": 13.503064592173503, "grad_norm": 0.4470185935497284, "learning_rate": 1.4424290024283812e-05, "loss": 0.1502, "num_input_tokens_seen": 28816160, "step": 28640 }, { "epoch": 13.505421970768506, "grad_norm": 0.012200675904750824, "learning_rate": 1.4414970602032302e-05, "loss": 0.1112, "num_input_tokens_seen": 28822080, "step": 28645 }, { "epoch": 13.507779349363508, "grad_norm": 0.3693697452545166, "learning_rate": 1.4405652971646544e-05, "loss": 0.0449, "num_input_tokens_seen": 28827264, "step": 28650 }, { "epoch": 13.51013672795851, "grad_norm": 0.10472443699836731, "learning_rate": 1.4396337134703857e-05, "loss": 0.0753, "num_input_tokens_seen": 28832224, "step": 28655 }, { "epoch": 13.512494106553513, "grad_norm": 0.9494622349739075, "learning_rate": 1.4387023092781252e-05, "loss": 0.0452, "num_input_tokens_seen": 28837216, "step": 28660 }, { "epoch": 13.514851485148515, "grad_norm": 1.4328120946884155, "learning_rate": 1.4377710847455439e-05, "loss": 0.1006, "num_input_tokens_seen": 28841344, "step": 28665 }, { "epoch": 13.517208863743518, "grad_norm": 2.6767008304595947, "learning_rate": 1.4368400400302825e-05, "loss": 0.2593, "num_input_tokens_seen": 28846016, "step": 28670 }, { "epoch": 13.51956624233852, "grad_norm": 0.06501052528619766, "learning_rate": 1.4359091752899518e-05, "loss": 0.0107, "num_input_tokens_seen": 28852224, "step": 28675 }, { "epoch": 13.521923620933523, "grad_norm": 0.43297284841537476, "learning_rate": 1.4349784906821295e-05, "loss": 0.0896, "num_input_tokens_seen": 28857024, "step": 28680 }, { "epoch": 13.524280999528525, "grad_norm": 0.17647679150104523, "learning_rate": 1.4340479863643658e-05, "loss": 0.0939, "num_input_tokens_seen": 28861472, "step": 28685 }, { "epoch": 13.526638378123527, "grad_norm": 0.5923174023628235, "learning_rate": 1.433117662494179e-05, "loss": 0.037, "num_input_tokens_seen": 28865984, "step": 28690 }, { "epoch": 13.52899575671853, "grad_norm": 0.9662690758705139, "learning_rate": 1.4321875192290573e-05, "loss": 0.0774, "num_input_tokens_seen": 28871200, "step": 28695 }, { "epoch": 13.531353135313532, "grad_norm": 0.15984708070755005, "learning_rate": 1.4312575567264586e-05, "loss": 0.1923, "num_input_tokens_seen": 28876576, "step": 28700 }, { "epoch": 13.533710513908535, "grad_norm": 1.3784233331680298, "learning_rate": 1.4303277751438087e-05, "loss": 0.1402, "num_input_tokens_seen": 28882496, "step": 28705 }, { "epoch": 13.536067892503535, "grad_norm": 2.6135013103485107, "learning_rate": 1.4293981746385049e-05, "loss": 0.1361, "num_input_tokens_seen": 28887360, "step": 28710 }, { "epoch": 13.53842527109854, "grad_norm": 0.7224027514457703, "learning_rate": 1.4284687553679132e-05, "loss": 0.1348, "num_input_tokens_seen": 28892576, "step": 28715 }, { "epoch": 13.54078264969354, "grad_norm": 0.4446435868740082, "learning_rate": 1.4275395174893666e-05, "loss": 0.1032, "num_input_tokens_seen": 28897472, "step": 28720 }, { "epoch": 13.543140028288542, "grad_norm": 0.2885054051876068, "learning_rate": 1.4266104611601705e-05, "loss": 0.1566, "num_input_tokens_seen": 28903040, "step": 28725 }, { "epoch": 13.545497406883545, "grad_norm": 0.04547109827399254, "learning_rate": 1.4256815865375981e-05, "loss": 0.0838, "num_input_tokens_seen": 28907712, "step": 28730 }, { "epoch": 13.547854785478547, "grad_norm": 0.02150794491171837, "learning_rate": 1.4247528937788921e-05, "loss": 0.1189, "num_input_tokens_seen": 28911968, "step": 28735 }, { "epoch": 13.55021216407355, "grad_norm": 1.0699156522750854, "learning_rate": 1.4238243830412646e-05, "loss": 0.4737, "num_input_tokens_seen": 28917024, "step": 28740 }, { "epoch": 13.552569542668552, "grad_norm": 0.44445717334747314, "learning_rate": 1.4228960544818967e-05, "loss": 0.0583, "num_input_tokens_seen": 28923328, "step": 28745 }, { "epoch": 13.554926921263554, "grad_norm": 0.9625312685966492, "learning_rate": 1.4219679082579385e-05, "loss": 0.198, "num_input_tokens_seen": 28928832, "step": 28750 }, { "epoch": 13.557284299858557, "grad_norm": 0.06749024242162704, "learning_rate": 1.4210399445265104e-05, "loss": 0.1139, "num_input_tokens_seen": 28933600, "step": 28755 }, { "epoch": 13.55964167845356, "grad_norm": 0.2081238180398941, "learning_rate": 1.4201121634446996e-05, "loss": 0.0391, "num_input_tokens_seen": 28938848, "step": 28760 }, { "epoch": 13.561999057048562, "grad_norm": 0.29915186762809753, "learning_rate": 1.4191845651695645e-05, "loss": 0.1555, "num_input_tokens_seen": 28943808, "step": 28765 }, { "epoch": 13.564356435643564, "grad_norm": 0.904871940612793, "learning_rate": 1.4182571498581304e-05, "loss": 0.1065, "num_input_tokens_seen": 28950272, "step": 28770 }, { "epoch": 13.566713814238566, "grad_norm": 0.17702658474445343, "learning_rate": 1.4173299176673938e-05, "loss": 0.0336, "num_input_tokens_seen": 28954720, "step": 28775 }, { "epoch": 13.569071192833569, "grad_norm": 0.9801680445671082, "learning_rate": 1.4164028687543196e-05, "loss": 0.13, "num_input_tokens_seen": 28959840, "step": 28780 }, { "epoch": 13.571428571428571, "grad_norm": 1.348339319229126, "learning_rate": 1.4154760032758412e-05, "loss": 0.16, "num_input_tokens_seen": 28965248, "step": 28785 }, { "epoch": 13.573785950023574, "grad_norm": 0.07950536906719208, "learning_rate": 1.4145493213888616e-05, "loss": 0.0296, "num_input_tokens_seen": 28970592, "step": 28790 }, { "epoch": 13.576143328618576, "grad_norm": 1.0721303224563599, "learning_rate": 1.4136228232502516e-05, "loss": 0.0394, "num_input_tokens_seen": 28976736, "step": 28795 }, { "epoch": 13.578500707213578, "grad_norm": 0.3073047697544098, "learning_rate": 1.412696509016852e-05, "loss": 0.1859, "num_input_tokens_seen": 28982432, "step": 28800 }, { "epoch": 13.58085808580858, "grad_norm": 0.4574936032295227, "learning_rate": 1.4117703788454734e-05, "loss": 0.0429, "num_input_tokens_seen": 28987904, "step": 28805 }, { "epoch": 13.583215464403583, "grad_norm": 0.14183825254440308, "learning_rate": 1.410844432892891e-05, "loss": 0.0773, "num_input_tokens_seen": 28992416, "step": 28810 }, { "epoch": 13.585572842998586, "grad_norm": 0.10821446776390076, "learning_rate": 1.4099186713158538e-05, "loss": 0.0608, "num_input_tokens_seen": 28997568, "step": 28815 }, { "epoch": 13.587930221593588, "grad_norm": 0.9457613229751587, "learning_rate": 1.408993094271077e-05, "loss": 0.0884, "num_input_tokens_seen": 29003168, "step": 28820 }, { "epoch": 13.59028760018859, "grad_norm": 0.05605119839310646, "learning_rate": 1.408067701915245e-05, "loss": 0.0913, "num_input_tokens_seen": 29008128, "step": 28825 }, { "epoch": 13.592644978783593, "grad_norm": 1.175561547279358, "learning_rate": 1.407142494405011e-05, "loss": 0.071, "num_input_tokens_seen": 29013280, "step": 28830 }, { "epoch": 13.595002357378595, "grad_norm": 2.483776092529297, "learning_rate": 1.4062174718969973e-05, "loss": 0.2677, "num_input_tokens_seen": 29018432, "step": 28835 }, { "epoch": 13.597359735973598, "grad_norm": 0.07000412791967392, "learning_rate": 1.405292634547794e-05, "loss": 0.0588, "num_input_tokens_seen": 29024512, "step": 28840 }, { "epoch": 13.5997171145686, "grad_norm": 0.668566107749939, "learning_rate": 1.4043679825139615e-05, "loss": 0.0995, "num_input_tokens_seen": 29029600, "step": 28845 }, { "epoch": 13.602074493163602, "grad_norm": 0.3180711269378662, "learning_rate": 1.403443515952026e-05, "loss": 0.0866, "num_input_tokens_seen": 29035136, "step": 28850 }, { "epoch": 13.604431871758605, "grad_norm": 1.4142255783081055, "learning_rate": 1.4025192350184846e-05, "loss": 0.0818, "num_input_tokens_seen": 29040064, "step": 28855 }, { "epoch": 13.606789250353607, "grad_norm": 1.2568414211273193, "learning_rate": 1.4015951398698027e-05, "loss": 0.2528, "num_input_tokens_seen": 29044576, "step": 28860 }, { "epoch": 13.60914662894861, "grad_norm": 0.028634624555706978, "learning_rate": 1.4006712306624137e-05, "loss": 0.0707, "num_input_tokens_seen": 29050272, "step": 28865 }, { "epoch": 13.611504007543612, "grad_norm": 1.0461838245391846, "learning_rate": 1.39974750755272e-05, "loss": 0.1063, "num_input_tokens_seen": 29055456, "step": 28870 }, { "epoch": 13.613861386138614, "grad_norm": 0.043459732085466385, "learning_rate": 1.3988239706970917e-05, "loss": 0.1511, "num_input_tokens_seen": 29061088, "step": 28875 }, { "epoch": 13.616218764733617, "grad_norm": 0.1623445749282837, "learning_rate": 1.3979006202518685e-05, "loss": 0.0407, "num_input_tokens_seen": 29066880, "step": 28880 }, { "epoch": 13.61857614332862, "grad_norm": 2.339963912963867, "learning_rate": 1.3969774563733585e-05, "loss": 0.3119, "num_input_tokens_seen": 29071712, "step": 28885 }, { "epoch": 13.620933521923622, "grad_norm": 0.4140051305294037, "learning_rate": 1.396054479217836e-05, "loss": 0.0386, "num_input_tokens_seen": 29075744, "step": 28890 }, { "epoch": 13.623290900518624, "grad_norm": 0.14838656783103943, "learning_rate": 1.3951316889415464e-05, "loss": 0.0882, "num_input_tokens_seen": 29081056, "step": 28895 }, { "epoch": 13.625648279113626, "grad_norm": 0.8685768842697144, "learning_rate": 1.3942090857007024e-05, "loss": 0.0849, "num_input_tokens_seen": 29086720, "step": 28900 }, { "epoch": 13.628005657708629, "grad_norm": 0.14705336093902588, "learning_rate": 1.3932866696514846e-05, "loss": 0.0732, "num_input_tokens_seen": 29091424, "step": 28905 }, { "epoch": 13.630363036303631, "grad_norm": 0.38445886969566345, "learning_rate": 1.3923644409500425e-05, "loss": 0.1532, "num_input_tokens_seen": 29097152, "step": 28910 }, { "epoch": 13.632720414898632, "grad_norm": 1.061688780784607, "learning_rate": 1.3914423997524945e-05, "loss": 0.0863, "num_input_tokens_seen": 29102272, "step": 28915 }, { "epoch": 13.635077793493634, "grad_norm": 1.633991003036499, "learning_rate": 1.3905205462149257e-05, "loss": 0.2161, "num_input_tokens_seen": 29107712, "step": 28920 }, { "epoch": 13.637435172088637, "grad_norm": 0.8939423561096191, "learning_rate": 1.3895988804933913e-05, "loss": 0.0807, "num_input_tokens_seen": 29112480, "step": 28925 }, { "epoch": 13.639792550683639, "grad_norm": 0.02388906292617321, "learning_rate": 1.3886774027439124e-05, "loss": 0.0557, "num_input_tokens_seen": 29117248, "step": 28930 }, { "epoch": 13.642149929278641, "grad_norm": 0.8817281126976013, "learning_rate": 1.3877561131224798e-05, "loss": 0.0624, "num_input_tokens_seen": 29122624, "step": 28935 }, { "epoch": 13.644507307873644, "grad_norm": 1.1993927955627441, "learning_rate": 1.386835011785052e-05, "loss": 0.1066, "num_input_tokens_seen": 29126816, "step": 28940 }, { "epoch": 13.646864686468646, "grad_norm": 0.9343817234039307, "learning_rate": 1.3859140988875569e-05, "loss": 0.0397, "num_input_tokens_seen": 29132352, "step": 28945 }, { "epoch": 13.649222065063649, "grad_norm": 0.676903247833252, "learning_rate": 1.3849933745858885e-05, "loss": 0.2056, "num_input_tokens_seen": 29137504, "step": 28950 }, { "epoch": 13.651579443658651, "grad_norm": 0.00514189200475812, "learning_rate": 1.38407283903591e-05, "loss": 0.1189, "num_input_tokens_seen": 29142336, "step": 28955 }, { "epoch": 13.653936822253653, "grad_norm": 1.6557137966156006, "learning_rate": 1.3831524923934525e-05, "loss": 0.3073, "num_input_tokens_seen": 29148448, "step": 28960 }, { "epoch": 13.656294200848656, "grad_norm": 0.11988908797502518, "learning_rate": 1.3822323348143163e-05, "loss": 0.0997, "num_input_tokens_seen": 29153120, "step": 28965 }, { "epoch": 13.658651579443658, "grad_norm": 1.408624291419983, "learning_rate": 1.381312366454266e-05, "loss": 0.2159, "num_input_tokens_seen": 29158624, "step": 28970 }, { "epoch": 13.66100895803866, "grad_norm": 0.29002901911735535, "learning_rate": 1.3803925874690377e-05, "loss": 0.1019, "num_input_tokens_seen": 29164064, "step": 28975 }, { "epoch": 13.663366336633663, "grad_norm": 0.18033139407634735, "learning_rate": 1.3794729980143344e-05, "loss": 0.328, "num_input_tokens_seen": 29168608, "step": 28980 }, { "epoch": 13.665723715228665, "grad_norm": 0.10837230831384659, "learning_rate": 1.3785535982458284e-05, "loss": 0.0317, "num_input_tokens_seen": 29173120, "step": 28985 }, { "epoch": 13.668081093823668, "grad_norm": 0.10170933604240417, "learning_rate": 1.3776343883191562e-05, "loss": 0.1834, "num_input_tokens_seen": 29177920, "step": 28990 }, { "epoch": 13.67043847241867, "grad_norm": 0.04060901701450348, "learning_rate": 1.376715368389925e-05, "loss": 0.0653, "num_input_tokens_seen": 29182528, "step": 28995 }, { "epoch": 13.672795851013673, "grad_norm": 0.0650557205080986, "learning_rate": 1.3757965386137098e-05, "loss": 0.126, "num_input_tokens_seen": 29186368, "step": 29000 }, { "epoch": 13.675153229608675, "grad_norm": 1.8793303966522217, "learning_rate": 1.3748778991460526e-05, "loss": 0.1346, "num_input_tokens_seen": 29191552, "step": 29005 }, { "epoch": 13.677510608203677, "grad_norm": 0.05631324648857117, "learning_rate": 1.3739594501424641e-05, "loss": 0.0395, "num_input_tokens_seen": 29196352, "step": 29010 }, { "epoch": 13.67986798679868, "grad_norm": 0.36978334188461304, "learning_rate": 1.373041191758422e-05, "loss": 0.1731, "num_input_tokens_seen": 29200960, "step": 29015 }, { "epoch": 13.682225365393682, "grad_norm": 0.9876042604446411, "learning_rate": 1.372123124149371e-05, "loss": 0.1987, "num_input_tokens_seen": 29205312, "step": 29020 }, { "epoch": 13.684582743988685, "grad_norm": 0.30207371711730957, "learning_rate": 1.3712052474707243e-05, "loss": 0.0833, "num_input_tokens_seen": 29209952, "step": 29025 }, { "epoch": 13.686940122583687, "grad_norm": 0.1999027132987976, "learning_rate": 1.3702875618778638e-05, "loss": 0.0942, "num_input_tokens_seen": 29215424, "step": 29030 }, { "epoch": 13.68929750117869, "grad_norm": 0.7250345349311829, "learning_rate": 1.3693700675261373e-05, "loss": 0.0369, "num_input_tokens_seen": 29221024, "step": 29035 }, { "epoch": 13.691654879773692, "grad_norm": 0.2182208001613617, "learning_rate": 1.3684527645708616e-05, "loss": 0.0751, "num_input_tokens_seen": 29225824, "step": 29040 }, { "epoch": 13.694012258368694, "grad_norm": 0.5920315980911255, "learning_rate": 1.36753565316732e-05, "loss": 0.0182, "num_input_tokens_seen": 29230144, "step": 29045 }, { "epoch": 13.696369636963697, "grad_norm": 0.12252553552389145, "learning_rate": 1.3666187334707645e-05, "loss": 0.0784, "num_input_tokens_seen": 29237152, "step": 29050 }, { "epoch": 13.698727015558699, "grad_norm": 2.0022032260894775, "learning_rate": 1.3657020056364145e-05, "loss": 0.1419, "num_input_tokens_seen": 29242720, "step": 29055 }, { "epoch": 13.701084394153701, "grad_norm": 0.5268600583076477, "learning_rate": 1.3647854698194545e-05, "loss": 0.0257, "num_input_tokens_seen": 29247104, "step": 29060 }, { "epoch": 13.703441772748704, "grad_norm": 1.3183449506759644, "learning_rate": 1.3638691261750403e-05, "loss": 0.1742, "num_input_tokens_seen": 29251712, "step": 29065 }, { "epoch": 13.705799151343706, "grad_norm": 0.5712764263153076, "learning_rate": 1.362952974858292e-05, "loss": 0.1157, "num_input_tokens_seen": 29255936, "step": 29070 }, { "epoch": 13.708156529938709, "grad_norm": 0.16410194337368011, "learning_rate": 1.3620370160242993e-05, "loss": 0.0498, "num_input_tokens_seen": 29260448, "step": 29075 }, { "epoch": 13.710513908533711, "grad_norm": 0.049301937222480774, "learning_rate": 1.361121249828118e-05, "loss": 0.1318, "num_input_tokens_seen": 29265760, "step": 29080 }, { "epoch": 13.712871287128714, "grad_norm": 1.2999615669250488, "learning_rate": 1.3602056764247722e-05, "loss": 0.0938, "num_input_tokens_seen": 29270208, "step": 29085 }, { "epoch": 13.715228665723716, "grad_norm": 0.050596680492162704, "learning_rate": 1.3592902959692533e-05, "loss": 0.0144, "num_input_tokens_seen": 29275520, "step": 29090 }, { "epoch": 13.717586044318718, "grad_norm": 0.05004073679447174, "learning_rate": 1.3583751086165197e-05, "loss": 0.1455, "num_input_tokens_seen": 29280544, "step": 29095 }, { "epoch": 13.71994342291372, "grad_norm": 0.023114554584026337, "learning_rate": 1.3574601145214954e-05, "loss": 0.0986, "num_input_tokens_seen": 29285568, "step": 29100 }, { "epoch": 13.722300801508723, "grad_norm": 0.2322109192609787, "learning_rate": 1.3565453138390746e-05, "loss": 0.1057, "num_input_tokens_seen": 29291520, "step": 29105 }, { "epoch": 13.724658180103724, "grad_norm": 0.1812194585800171, "learning_rate": 1.3556307067241175e-05, "loss": 0.0645, "num_input_tokens_seen": 29297376, "step": 29110 }, { "epoch": 13.727015558698728, "grad_norm": 0.5654190182685852, "learning_rate": 1.3547162933314514e-05, "loss": 0.2249, "num_input_tokens_seen": 29302080, "step": 29115 }, { "epoch": 13.729372937293729, "grad_norm": 1.2843934297561646, "learning_rate": 1.3538020738158708e-05, "loss": 0.2106, "num_input_tokens_seen": 29307008, "step": 29120 }, { "epoch": 13.731730315888731, "grad_norm": 1.0651649236679077, "learning_rate": 1.3528880483321383e-05, "loss": 0.2031, "num_input_tokens_seen": 29312160, "step": 29125 }, { "epoch": 13.734087694483733, "grad_norm": 1.386203408241272, "learning_rate": 1.351974217034982e-05, "loss": 0.084, "num_input_tokens_seen": 29316416, "step": 29130 }, { "epoch": 13.736445073078736, "grad_norm": 2.306304454803467, "learning_rate": 1.3510605800790994e-05, "loss": 0.1073, "num_input_tokens_seen": 29321344, "step": 29135 }, { "epoch": 13.738802451673738, "grad_norm": 1.4856083393096924, "learning_rate": 1.3501471376191516e-05, "loss": 0.1934, "num_input_tokens_seen": 29327136, "step": 29140 }, { "epoch": 13.74115983026874, "grad_norm": 0.45571598410606384, "learning_rate": 1.3492338898097703e-05, "loss": 0.1369, "num_input_tokens_seen": 29331712, "step": 29145 }, { "epoch": 13.743517208863743, "grad_norm": 1.3267475366592407, "learning_rate": 1.3483208368055524e-05, "loss": 0.2846, "num_input_tokens_seen": 29336512, "step": 29150 }, { "epoch": 13.745874587458745, "grad_norm": 0.06644226610660553, "learning_rate": 1.3474079787610624e-05, "loss": 0.0253, "num_input_tokens_seen": 29340832, "step": 29155 }, { "epoch": 13.748231966053748, "grad_norm": 0.07341130822896957, "learning_rate": 1.3464953158308319e-05, "loss": 0.1728, "num_input_tokens_seen": 29346016, "step": 29160 }, { "epoch": 13.75058934464875, "grad_norm": 0.054606370627880096, "learning_rate": 1.3455828481693589e-05, "loss": 0.0779, "num_input_tokens_seen": 29351488, "step": 29165 }, { "epoch": 13.752946723243753, "grad_norm": 0.14311382174491882, "learning_rate": 1.3446705759311088e-05, "loss": 0.168, "num_input_tokens_seen": 29355712, "step": 29170 }, { "epoch": 13.755304101838755, "grad_norm": 0.15960998833179474, "learning_rate": 1.343758499270515e-05, "loss": 0.1488, "num_input_tokens_seen": 29360256, "step": 29175 }, { "epoch": 13.757661480433757, "grad_norm": 0.40097758173942566, "learning_rate": 1.342846618341974e-05, "loss": 0.0266, "num_input_tokens_seen": 29365088, "step": 29180 }, { "epoch": 13.76001885902876, "grad_norm": 0.03326291963458061, "learning_rate": 1.3419349332998534e-05, "loss": 0.0832, "num_input_tokens_seen": 29369952, "step": 29185 }, { "epoch": 13.762376237623762, "grad_norm": 0.6746661067008972, "learning_rate": 1.3410234442984859e-05, "loss": 0.0439, "num_input_tokens_seen": 29375392, "step": 29190 }, { "epoch": 13.764733616218765, "grad_norm": 0.13158543407917023, "learning_rate": 1.3401121514921708e-05, "loss": 0.1019, "num_input_tokens_seen": 29381024, "step": 29195 }, { "epoch": 13.767090994813767, "grad_norm": 2.462146759033203, "learning_rate": 1.3392010550351747e-05, "loss": 0.2359, "num_input_tokens_seen": 29387232, "step": 29200 }, { "epoch": 13.76944837340877, "grad_norm": 0.28206667304039, "learning_rate": 1.3382901550817309e-05, "loss": 0.1371, "num_input_tokens_seen": 29391232, "step": 29205 }, { "epoch": 13.771805752003772, "grad_norm": 2.451195478439331, "learning_rate": 1.33737945178604e-05, "loss": 0.0783, "num_input_tokens_seen": 29395936, "step": 29210 }, { "epoch": 13.774163130598774, "grad_norm": 0.03836775943636894, "learning_rate": 1.3364689453022666e-05, "loss": 0.0477, "num_input_tokens_seen": 29401600, "step": 29215 }, { "epoch": 13.776520509193777, "grad_norm": 0.04539106786251068, "learning_rate": 1.3355586357845453e-05, "loss": 0.1154, "num_input_tokens_seen": 29407648, "step": 29220 }, { "epoch": 13.778877887788779, "grad_norm": 0.08109341561794281, "learning_rate": 1.3346485233869768e-05, "loss": 0.0466, "num_input_tokens_seen": 29412416, "step": 29225 }, { "epoch": 13.781235266383781, "grad_norm": 1.8187999725341797, "learning_rate": 1.333738608263626e-05, "loss": 0.1048, "num_input_tokens_seen": 29417376, "step": 29230 }, { "epoch": 13.783592644978784, "grad_norm": 0.742836058139801, "learning_rate": 1.3328288905685266e-05, "loss": 0.1052, "num_input_tokens_seen": 29421952, "step": 29235 }, { "epoch": 13.785950023573786, "grad_norm": 0.04295028746128082, "learning_rate": 1.3319193704556786e-05, "loss": 0.0349, "num_input_tokens_seen": 29427232, "step": 29240 }, { "epoch": 13.788307402168789, "grad_norm": 0.06498097628355026, "learning_rate": 1.3310100480790483e-05, "loss": 0.0776, "num_input_tokens_seen": 29431488, "step": 29245 }, { "epoch": 13.790664780763791, "grad_norm": 1.0893348455429077, "learning_rate": 1.3301009235925684e-05, "loss": 0.134, "num_input_tokens_seen": 29437536, "step": 29250 }, { "epoch": 13.793022159358793, "grad_norm": 0.22265827655792236, "learning_rate": 1.3291919971501387e-05, "loss": 0.0409, "num_input_tokens_seen": 29442624, "step": 29255 }, { "epoch": 13.795379537953796, "grad_norm": 0.18302428722381592, "learning_rate": 1.3282832689056246e-05, "loss": 0.0677, "num_input_tokens_seen": 29447904, "step": 29260 }, { "epoch": 13.797736916548798, "grad_norm": 1.3870216608047485, "learning_rate": 1.3273747390128593e-05, "loss": 0.0698, "num_input_tokens_seen": 29453024, "step": 29265 }, { "epoch": 13.8000942951438, "grad_norm": 1.739785075187683, "learning_rate": 1.3264664076256398e-05, "loss": 0.1617, "num_input_tokens_seen": 29458560, "step": 29270 }, { "epoch": 13.802451673738803, "grad_norm": 0.754326581954956, "learning_rate": 1.325558274897732e-05, "loss": 0.0691, "num_input_tokens_seen": 29462688, "step": 29275 }, { "epoch": 13.804809052333805, "grad_norm": 0.8853325843811035, "learning_rate": 1.324650340982867e-05, "loss": 0.0762, "num_input_tokens_seen": 29467680, "step": 29280 }, { "epoch": 13.807166430928808, "grad_norm": 0.06890065222978592, "learning_rate": 1.3237426060347436e-05, "loss": 0.0477, "num_input_tokens_seen": 29471968, "step": 29285 }, { "epoch": 13.80952380952381, "grad_norm": 1.026494026184082, "learning_rate": 1.3228350702070247e-05, "loss": 0.1983, "num_input_tokens_seen": 29477088, "step": 29290 }, { "epoch": 13.811881188118813, "grad_norm": 1.4328950643539429, "learning_rate": 1.3219277336533418e-05, "loss": 0.1669, "num_input_tokens_seen": 29481056, "step": 29295 }, { "epoch": 13.814238566713815, "grad_norm": 0.10142850130796432, "learning_rate": 1.3210205965272909e-05, "loss": 0.0502, "num_input_tokens_seen": 29486016, "step": 29300 }, { "epoch": 13.816595945308817, "grad_norm": 0.892368733882904, "learning_rate": 1.3201136589824359e-05, "loss": 0.0937, "num_input_tokens_seen": 29492096, "step": 29305 }, { "epoch": 13.81895332390382, "grad_norm": 0.3986870348453522, "learning_rate": 1.319206921172304e-05, "loss": 0.0475, "num_input_tokens_seen": 29498752, "step": 29310 }, { "epoch": 13.82131070249882, "grad_norm": 0.6470142602920532, "learning_rate": 1.318300383250392e-05, "loss": 0.0634, "num_input_tokens_seen": 29502720, "step": 29315 }, { "epoch": 13.823668081093825, "grad_norm": 0.5167614817619324, "learning_rate": 1.3173940453701608e-05, "loss": 0.1778, "num_input_tokens_seen": 29506496, "step": 29320 }, { "epoch": 13.826025459688825, "grad_norm": 0.3829822838306427, "learning_rate": 1.3164879076850384e-05, "loss": 0.0438, "num_input_tokens_seen": 29512384, "step": 29325 }, { "epoch": 13.828382838283828, "grad_norm": 0.2739291191101074, "learning_rate": 1.3155819703484179e-05, "loss": 0.0612, "num_input_tokens_seen": 29516800, "step": 29330 }, { "epoch": 13.83074021687883, "grad_norm": 0.18048208951950073, "learning_rate": 1.31467623351366e-05, "loss": 0.1801, "num_input_tokens_seen": 29522368, "step": 29335 }, { "epoch": 13.833097595473832, "grad_norm": 1.505961298942566, "learning_rate": 1.3137706973340899e-05, "loss": 0.1767, "num_input_tokens_seen": 29526720, "step": 29340 }, { "epoch": 13.835454974068835, "grad_norm": 0.04149090498685837, "learning_rate": 1.3128653619630007e-05, "loss": 0.0919, "num_input_tokens_seen": 29531200, "step": 29345 }, { "epoch": 13.837812352663837, "grad_norm": 0.3839355409145355, "learning_rate": 1.3119602275536485e-05, "loss": 0.0876, "num_input_tokens_seen": 29537312, "step": 29350 }, { "epoch": 13.84016973125884, "grad_norm": 0.42016613483428955, "learning_rate": 1.311055294259258e-05, "loss": 0.0594, "num_input_tokens_seen": 29542400, "step": 29355 }, { "epoch": 13.842527109853842, "grad_norm": 0.055239155888557434, "learning_rate": 1.310150562233019e-05, "loss": 0.3815, "num_input_tokens_seen": 29547232, "step": 29360 }, { "epoch": 13.844884488448844, "grad_norm": 0.45364537835121155, "learning_rate": 1.3092460316280869e-05, "loss": 0.1157, "num_input_tokens_seen": 29553888, "step": 29365 }, { "epoch": 13.847241867043847, "grad_norm": 0.9180901050567627, "learning_rate": 1.3083417025975841e-05, "loss": 0.1861, "num_input_tokens_seen": 29558240, "step": 29370 }, { "epoch": 13.84959924563885, "grad_norm": 1.050742506980896, "learning_rate": 1.3074375752945982e-05, "loss": 0.0741, "num_input_tokens_seen": 29563456, "step": 29375 }, { "epoch": 13.851956624233852, "grad_norm": 0.7230445146560669, "learning_rate": 1.306533649872182e-05, "loss": 0.1007, "num_input_tokens_seen": 29568256, "step": 29380 }, { "epoch": 13.854314002828854, "grad_norm": 0.061699770390987396, "learning_rate": 1.3056299264833558e-05, "loss": 0.0897, "num_input_tokens_seen": 29573568, "step": 29385 }, { "epoch": 13.856671381423856, "grad_norm": 0.3143185079097748, "learning_rate": 1.3047264052811028e-05, "loss": 0.071, "num_input_tokens_seen": 29577632, "step": 29390 }, { "epoch": 13.859028760018859, "grad_norm": 0.5525692105293274, "learning_rate": 1.303823086418375e-05, "loss": 0.1342, "num_input_tokens_seen": 29581888, "step": 29395 }, { "epoch": 13.861386138613861, "grad_norm": 0.2589309811592102, "learning_rate": 1.3029199700480887e-05, "loss": 0.0469, "num_input_tokens_seen": 29586848, "step": 29400 }, { "epoch": 13.863743517208864, "grad_norm": 0.11754032224416733, "learning_rate": 1.302017056323126e-05, "loss": 0.1399, "num_input_tokens_seen": 29591616, "step": 29405 }, { "epoch": 13.866100895803866, "grad_norm": 0.07130353152751923, "learning_rate": 1.3011143453963353e-05, "loss": 0.0206, "num_input_tokens_seen": 29596416, "step": 29410 }, { "epoch": 13.868458274398868, "grad_norm": 0.5872856974601746, "learning_rate": 1.3002118374205302e-05, "loss": 0.0572, "num_input_tokens_seen": 29600960, "step": 29415 }, { "epoch": 13.87081565299387, "grad_norm": 1.4981147050857544, "learning_rate": 1.2993095325484894e-05, "loss": 0.2694, "num_input_tokens_seen": 29604736, "step": 29420 }, { "epoch": 13.873173031588873, "grad_norm": 0.3908776342868805, "learning_rate": 1.2984074309329592e-05, "loss": 0.1486, "num_input_tokens_seen": 29608768, "step": 29425 }, { "epoch": 13.875530410183876, "grad_norm": 0.9539837837219238, "learning_rate": 1.2975055327266482e-05, "loss": 0.1932, "num_input_tokens_seen": 29616352, "step": 29430 }, { "epoch": 13.877887788778878, "grad_norm": 0.031283922493457794, "learning_rate": 1.2966038380822344e-05, "loss": 0.1753, "num_input_tokens_seen": 29622912, "step": 29435 }, { "epoch": 13.88024516737388, "grad_norm": 0.1884024739265442, "learning_rate": 1.2957023471523571e-05, "loss": 0.0584, "num_input_tokens_seen": 29627328, "step": 29440 }, { "epoch": 13.882602545968883, "grad_norm": 0.06284944713115692, "learning_rate": 1.2948010600896246e-05, "loss": 0.0586, "num_input_tokens_seen": 29633536, "step": 29445 }, { "epoch": 13.884959924563885, "grad_norm": 0.4041450321674347, "learning_rate": 1.2938999770466098e-05, "loss": 0.093, "num_input_tokens_seen": 29639200, "step": 29450 }, { "epoch": 13.887317303158888, "grad_norm": 1.9482851028442383, "learning_rate": 1.29299909817585e-05, "loss": 0.1534, "num_input_tokens_seen": 29644416, "step": 29455 }, { "epoch": 13.88967468175389, "grad_norm": 0.11727999895811081, "learning_rate": 1.2920984236298494e-05, "loss": 0.1329, "num_input_tokens_seen": 29649216, "step": 29460 }, { "epoch": 13.892032060348892, "grad_norm": 0.7571528553962708, "learning_rate": 1.291197953561077e-05, "loss": 0.2314, "num_input_tokens_seen": 29654528, "step": 29465 }, { "epoch": 13.894389438943895, "grad_norm": 0.026284679770469666, "learning_rate": 1.2902976881219676e-05, "loss": 0.0988, "num_input_tokens_seen": 29659456, "step": 29470 }, { "epoch": 13.896746817538897, "grad_norm": 0.8607159852981567, "learning_rate": 1.289397627464919e-05, "loss": 0.0742, "num_input_tokens_seen": 29664992, "step": 29475 }, { "epoch": 13.8991041961339, "grad_norm": 1.698206901550293, "learning_rate": 1.2884977717422969e-05, "loss": 0.13, "num_input_tokens_seen": 29671776, "step": 29480 }, { "epoch": 13.901461574728902, "grad_norm": 1.949044942855835, "learning_rate": 1.287598121106432e-05, "loss": 0.262, "num_input_tokens_seen": 29676128, "step": 29485 }, { "epoch": 13.903818953323904, "grad_norm": 1.0735834836959839, "learning_rate": 1.28669867570962e-05, "loss": 0.1636, "num_input_tokens_seen": 29680736, "step": 29490 }, { "epoch": 13.906176331918907, "grad_norm": 0.05525168776512146, "learning_rate": 1.2857994357041214e-05, "loss": 0.0485, "num_input_tokens_seen": 29685664, "step": 29495 }, { "epoch": 13.90853371051391, "grad_norm": 0.8135247826576233, "learning_rate": 1.2849004012421626e-05, "loss": 0.0513, "num_input_tokens_seen": 29690272, "step": 29500 }, { "epoch": 13.910891089108912, "grad_norm": 0.41719233989715576, "learning_rate": 1.2840015724759344e-05, "loss": 0.092, "num_input_tokens_seen": 29695904, "step": 29505 }, { "epoch": 13.913248467703912, "grad_norm": 0.3535512387752533, "learning_rate": 1.2831029495575947e-05, "loss": 0.0786, "num_input_tokens_seen": 29699776, "step": 29510 }, { "epoch": 13.915605846298917, "grad_norm": 0.6348496079444885, "learning_rate": 1.2822045326392627e-05, "loss": 0.1039, "num_input_tokens_seen": 29706144, "step": 29515 }, { "epoch": 13.917963224893917, "grad_norm": 1.087302803993225, "learning_rate": 1.2813063218730265e-05, "loss": 0.0389, "num_input_tokens_seen": 29709792, "step": 29520 }, { "epoch": 13.92032060348892, "grad_norm": 0.11944487690925598, "learning_rate": 1.2804083174109377e-05, "loss": 0.0763, "num_input_tokens_seen": 29716512, "step": 29525 }, { "epoch": 13.922677982083922, "grad_norm": 0.20058022439479828, "learning_rate": 1.2795105194050133e-05, "loss": 0.0939, "num_input_tokens_seen": 29721568, "step": 29530 }, { "epoch": 13.925035360678924, "grad_norm": 0.06631022691726685, "learning_rate": 1.278612928007235e-05, "loss": 0.0299, "num_input_tokens_seen": 29725984, "step": 29535 }, { "epoch": 13.927392739273927, "grad_norm": 0.09223200380802155, "learning_rate": 1.2777155433695503e-05, "loss": 0.0868, "num_input_tokens_seen": 29732448, "step": 29540 }, { "epoch": 13.92975011786893, "grad_norm": 1.5835342407226562, "learning_rate": 1.2768183656438709e-05, "loss": 0.1426, "num_input_tokens_seen": 29737088, "step": 29545 }, { "epoch": 13.932107496463932, "grad_norm": 0.09536748379468918, "learning_rate": 1.2759213949820748e-05, "loss": 0.1679, "num_input_tokens_seen": 29741984, "step": 29550 }, { "epoch": 13.934464875058934, "grad_norm": 0.40352851152420044, "learning_rate": 1.2750246315360015e-05, "loss": 0.1847, "num_input_tokens_seen": 29746784, "step": 29555 }, { "epoch": 13.936822253653936, "grad_norm": 3.454348564147949, "learning_rate": 1.2741280754574595e-05, "loss": 0.2549, "num_input_tokens_seen": 29751616, "step": 29560 }, { "epoch": 13.939179632248939, "grad_norm": 3.5542871952056885, "learning_rate": 1.27323172689822e-05, "loss": 0.1952, "num_input_tokens_seen": 29755936, "step": 29565 }, { "epoch": 13.941537010843941, "grad_norm": 0.05605584755539894, "learning_rate": 1.27233558601002e-05, "loss": 0.0305, "num_input_tokens_seen": 29759936, "step": 29570 }, { "epoch": 13.943894389438944, "grad_norm": 0.1560496836900711, "learning_rate": 1.2714396529445605e-05, "loss": 0.1503, "num_input_tokens_seen": 29765632, "step": 29575 }, { "epoch": 13.946251768033946, "grad_norm": 0.26690247654914856, "learning_rate": 1.270543927853508e-05, "loss": 0.0588, "num_input_tokens_seen": 29770656, "step": 29580 }, { "epoch": 13.948609146628948, "grad_norm": 0.42241668701171875, "learning_rate": 1.2696484108884938e-05, "loss": 0.1958, "num_input_tokens_seen": 29775456, "step": 29585 }, { "epoch": 13.95096652522395, "grad_norm": 0.09769896417856216, "learning_rate": 1.2687531022011145e-05, "loss": 0.0702, "num_input_tokens_seen": 29779808, "step": 29590 }, { "epoch": 13.953323903818953, "grad_norm": 0.23728783428668976, "learning_rate": 1.2678580019429282e-05, "loss": 0.1733, "num_input_tokens_seen": 29785664, "step": 29595 }, { "epoch": 13.955681282413956, "grad_norm": 0.721576452255249, "learning_rate": 1.266963110265462e-05, "loss": 0.0537, "num_input_tokens_seen": 29790752, "step": 29600 }, { "epoch": 13.958038661008958, "grad_norm": 0.06714606285095215, "learning_rate": 1.2660684273202053e-05, "loss": 0.0839, "num_input_tokens_seen": 29794464, "step": 29605 }, { "epoch": 13.96039603960396, "grad_norm": 0.33942994475364685, "learning_rate": 1.265173953258613e-05, "loss": 0.1009, "num_input_tokens_seen": 29799168, "step": 29610 }, { "epoch": 13.962753418198963, "grad_norm": 0.19330646097660065, "learning_rate": 1.2642796882321039e-05, "loss": 0.0218, "num_input_tokens_seen": 29804064, "step": 29615 }, { "epoch": 13.965110796793965, "grad_norm": 0.06960681825876236, "learning_rate": 1.2633856323920626e-05, "loss": 0.0145, "num_input_tokens_seen": 29809312, "step": 29620 }, { "epoch": 13.967468175388968, "grad_norm": 1.329331636428833, "learning_rate": 1.2624917858898373e-05, "loss": 0.2384, "num_input_tokens_seen": 29813408, "step": 29625 }, { "epoch": 13.96982555398397, "grad_norm": 1.1121647357940674, "learning_rate": 1.2615981488767415e-05, "loss": 0.1421, "num_input_tokens_seen": 29819296, "step": 29630 }, { "epoch": 13.972182932578972, "grad_norm": 1.7398877143859863, "learning_rate": 1.2607047215040515e-05, "loss": 0.0852, "num_input_tokens_seen": 29824064, "step": 29635 }, { "epoch": 13.974540311173975, "grad_norm": 0.12966901063919067, "learning_rate": 1.2598115039230102e-05, "loss": 0.091, "num_input_tokens_seen": 29828832, "step": 29640 }, { "epoch": 13.976897689768977, "grad_norm": 1.0480817556381226, "learning_rate": 1.2589184962848239e-05, "loss": 0.0767, "num_input_tokens_seen": 29833280, "step": 29645 }, { "epoch": 13.97925506836398, "grad_norm": 0.8832531571388245, "learning_rate": 1.258025698740664e-05, "loss": 0.1252, "num_input_tokens_seen": 29838016, "step": 29650 }, { "epoch": 13.981612446958982, "grad_norm": 0.19311396777629852, "learning_rate": 1.2571331114416657e-05, "loss": 0.1376, "num_input_tokens_seen": 29843456, "step": 29655 }, { "epoch": 13.983969825553984, "grad_norm": 1.5040326118469238, "learning_rate": 1.2562407345389298e-05, "loss": 0.1642, "num_input_tokens_seen": 29848128, "step": 29660 }, { "epoch": 13.986327204148987, "grad_norm": 0.09410376101732254, "learning_rate": 1.255348568183519e-05, "loss": 0.0784, "num_input_tokens_seen": 29852512, "step": 29665 }, { "epoch": 13.98868458274399, "grad_norm": 0.7834752798080444, "learning_rate": 1.2544566125264623e-05, "loss": 0.0763, "num_input_tokens_seen": 29856832, "step": 29670 }, { "epoch": 13.991041961338992, "grad_norm": 0.33075544238090515, "learning_rate": 1.2535648677187534e-05, "loss": 0.0915, "num_input_tokens_seen": 29861600, "step": 29675 }, { "epoch": 13.993399339933994, "grad_norm": 0.9510080814361572, "learning_rate": 1.2526733339113498e-05, "loss": 0.0691, "num_input_tokens_seen": 29866272, "step": 29680 }, { "epoch": 13.995756718528996, "grad_norm": 0.8221756815910339, "learning_rate": 1.2517820112551714e-05, "loss": 0.0871, "num_input_tokens_seen": 29872192, "step": 29685 }, { "epoch": 13.998114097123999, "grad_norm": 0.3791232705116272, "learning_rate": 1.250890899901105e-05, "loss": 0.0417, "num_input_tokens_seen": 29877120, "step": 29690 }, { "epoch": 14.0, "eval_loss": 0.151794895529747, "eval_runtime": 15.08, "eval_samples_per_second": 62.533, "eval_steps_per_second": 15.65, "num_input_tokens_seen": 29880544, "step": 29694 }, { "epoch": 14.000471475719001, "grad_norm": 0.8711604475975037, "learning_rate": 1.2500000000000006e-05, "loss": 0.0831, "num_input_tokens_seen": 29881408, "step": 29695 }, { "epoch": 14.002828854314004, "grad_norm": 0.8437722325325012, "learning_rate": 1.2491093117026723e-05, "loss": 0.1805, "num_input_tokens_seen": 29885824, "step": 29700 }, { "epoch": 14.005186232909006, "grad_norm": 0.6449236869812012, "learning_rate": 1.248218835159899e-05, "loss": 0.1282, "num_input_tokens_seen": 29890688, "step": 29705 }, { "epoch": 14.007543611504008, "grad_norm": 0.015286078676581383, "learning_rate": 1.2473285705224225e-05, "loss": 0.0264, "num_input_tokens_seen": 29895584, "step": 29710 }, { "epoch": 14.009900990099009, "grad_norm": 1.7033352851867676, "learning_rate": 1.2464385179409499e-05, "loss": 0.1677, "num_input_tokens_seen": 29901760, "step": 29715 }, { "epoch": 14.012258368694011, "grad_norm": 0.08982601016759872, "learning_rate": 1.2455486775661529e-05, "loss": 0.0201, "num_input_tokens_seen": 29907392, "step": 29720 }, { "epoch": 14.014615747289014, "grad_norm": 1.3016575574874878, "learning_rate": 1.2446590495486645e-05, "loss": 0.1111, "num_input_tokens_seen": 29912288, "step": 29725 }, { "epoch": 14.016973125884016, "grad_norm": 0.12157317996025085, "learning_rate": 1.2437696340390844e-05, "loss": 0.0205, "num_input_tokens_seen": 29916128, "step": 29730 }, { "epoch": 14.019330504479019, "grad_norm": 0.09025556594133377, "learning_rate": 1.242880431187976e-05, "loss": 0.0846, "num_input_tokens_seen": 29920384, "step": 29735 }, { "epoch": 14.021687883074021, "grad_norm": 1.6230684518814087, "learning_rate": 1.2419914411458657e-05, "loss": 0.0733, "num_input_tokens_seen": 29925664, "step": 29740 }, { "epoch": 14.024045261669023, "grad_norm": 0.13145093619823456, "learning_rate": 1.2411026640632448e-05, "loss": 0.1449, "num_input_tokens_seen": 29932128, "step": 29745 }, { "epoch": 14.026402640264026, "grad_norm": 0.9180737137794495, "learning_rate": 1.2402141000905678e-05, "loss": 0.0979, "num_input_tokens_seen": 29936512, "step": 29750 }, { "epoch": 14.028760018859028, "grad_norm": 0.4962591528892517, "learning_rate": 1.239325749378254e-05, "loss": 0.1846, "num_input_tokens_seen": 29940992, "step": 29755 }, { "epoch": 14.03111739745403, "grad_norm": 0.8398582935333252, "learning_rate": 1.2384376120766867e-05, "loss": 0.056, "num_input_tokens_seen": 29946080, "step": 29760 }, { "epoch": 14.033474776049033, "grad_norm": 0.08501417189836502, "learning_rate": 1.2375496883362103e-05, "loss": 0.0549, "num_input_tokens_seen": 29950368, "step": 29765 }, { "epoch": 14.035832154644035, "grad_norm": 0.2738635838031769, "learning_rate": 1.2366619783071368e-05, "loss": 0.0519, "num_input_tokens_seen": 29955136, "step": 29770 }, { "epoch": 14.038189533239038, "grad_norm": 0.07473307102918625, "learning_rate": 1.23577448213974e-05, "loss": 0.036, "num_input_tokens_seen": 29960864, "step": 29775 }, { "epoch": 14.04054691183404, "grad_norm": 0.05340682342648506, "learning_rate": 1.2348871999842579e-05, "loss": 0.057, "num_input_tokens_seen": 29965600, "step": 29780 }, { "epoch": 14.042904290429043, "grad_norm": 0.1566576510667801, "learning_rate": 1.2340001319908925e-05, "loss": 0.2013, "num_input_tokens_seen": 29969600, "step": 29785 }, { "epoch": 14.045261669024045, "grad_norm": 1.7864705324172974, "learning_rate": 1.2331132783098096e-05, "loss": 0.106, "num_input_tokens_seen": 29973888, "step": 29790 }, { "epoch": 14.047619047619047, "grad_norm": 0.45101261138916016, "learning_rate": 1.2322266390911378e-05, "loss": 0.2196, "num_input_tokens_seen": 29979040, "step": 29795 }, { "epoch": 14.04997642621405, "grad_norm": 0.4274483919143677, "learning_rate": 1.2313402144849714e-05, "loss": 0.2336, "num_input_tokens_seen": 29984256, "step": 29800 }, { "epoch": 14.052333804809052, "grad_norm": 1.356960654258728, "learning_rate": 1.2304540046413655e-05, "loss": 0.128, "num_input_tokens_seen": 29989152, "step": 29805 }, { "epoch": 14.054691183404055, "grad_norm": 0.8192353844642639, "learning_rate": 1.2295680097103407e-05, "loss": 0.1009, "num_input_tokens_seen": 29994464, "step": 29810 }, { "epoch": 14.057048561999057, "grad_norm": 1.9401350021362305, "learning_rate": 1.228682229841881e-05, "loss": 0.1376, "num_input_tokens_seen": 29998848, "step": 29815 }, { "epoch": 14.05940594059406, "grad_norm": 3.0766961574554443, "learning_rate": 1.2277966651859343e-05, "loss": 0.2694, "num_input_tokens_seen": 30003520, "step": 29820 }, { "epoch": 14.061763319189062, "grad_norm": 0.0870598554611206, "learning_rate": 1.2269113158924111e-05, "loss": 0.1541, "num_input_tokens_seen": 30008320, "step": 29825 }, { "epoch": 14.064120697784064, "grad_norm": 0.0034142460208386183, "learning_rate": 1.2260261821111866e-05, "loss": 0.0851, "num_input_tokens_seen": 30013344, "step": 29830 }, { "epoch": 14.066478076379067, "grad_norm": 0.4693237245082855, "learning_rate": 1.2251412639920986e-05, "loss": 0.1372, "num_input_tokens_seen": 30018592, "step": 29835 }, { "epoch": 14.068835454974069, "grad_norm": 0.060741107910871506, "learning_rate": 1.2242565616849497e-05, "loss": 0.078, "num_input_tokens_seen": 30023616, "step": 29840 }, { "epoch": 14.071192833569071, "grad_norm": 2.837411642074585, "learning_rate": 1.2233720753395028e-05, "loss": 0.1704, "num_input_tokens_seen": 30028576, "step": 29845 }, { "epoch": 14.073550212164074, "grad_norm": 0.6709186434745789, "learning_rate": 1.2224878051054877e-05, "loss": 0.0521, "num_input_tokens_seen": 30033728, "step": 29850 }, { "epoch": 14.075907590759076, "grad_norm": 0.2459692806005478, "learning_rate": 1.2216037511325968e-05, "loss": 0.035, "num_input_tokens_seen": 30038368, "step": 29855 }, { "epoch": 14.078264969354079, "grad_norm": 0.5748240351676941, "learning_rate": 1.2207199135704842e-05, "loss": 0.1451, "num_input_tokens_seen": 30042688, "step": 29860 }, { "epoch": 14.080622347949081, "grad_norm": 1.3541202545166016, "learning_rate": 1.2198362925687698e-05, "loss": 0.234, "num_input_tokens_seen": 30047296, "step": 29865 }, { "epoch": 14.082979726544083, "grad_norm": 0.14178499579429626, "learning_rate": 1.2189528882770349e-05, "loss": 0.2409, "num_input_tokens_seen": 30051904, "step": 29870 }, { "epoch": 14.085337105139086, "grad_norm": 0.07883158326148987, "learning_rate": 1.218069700844825e-05, "loss": 0.1998, "num_input_tokens_seen": 30057088, "step": 29875 }, { "epoch": 14.087694483734088, "grad_norm": 0.28502634167671204, "learning_rate": 1.2171867304216497e-05, "loss": 0.0981, "num_input_tokens_seen": 30061504, "step": 29880 }, { "epoch": 14.09005186232909, "grad_norm": 0.1905713975429535, "learning_rate": 1.2163039771569793e-05, "loss": 0.2414, "num_input_tokens_seen": 30066528, "step": 29885 }, { "epoch": 14.092409240924093, "grad_norm": 0.035549771040678024, "learning_rate": 1.2154214412002504e-05, "loss": 0.0418, "num_input_tokens_seen": 30071360, "step": 29890 }, { "epoch": 14.094766619519095, "grad_norm": 0.5243905186653137, "learning_rate": 1.2145391227008598e-05, "loss": 0.0376, "num_input_tokens_seen": 30077536, "step": 29895 }, { "epoch": 14.097123998114098, "grad_norm": 2.2235612869262695, "learning_rate": 1.21365702180817e-05, "loss": 0.0888, "num_input_tokens_seen": 30082176, "step": 29900 }, { "epoch": 14.0994813767091, "grad_norm": 0.2706591486930847, "learning_rate": 1.2127751386715052e-05, "loss": 0.121, "num_input_tokens_seen": 30086912, "step": 29905 }, { "epoch": 14.101838755304103, "grad_norm": 0.03073797933757305, "learning_rate": 1.2118934734401538e-05, "loss": 0.1322, "num_input_tokens_seen": 30091328, "step": 29910 }, { "epoch": 14.104196133899103, "grad_norm": 0.07605541497468948, "learning_rate": 1.2110120262633664e-05, "loss": 0.1451, "num_input_tokens_seen": 30096000, "step": 29915 }, { "epoch": 14.106553512494106, "grad_norm": 0.06732278317213058, "learning_rate": 1.2101307972903573e-05, "loss": 0.1027, "num_input_tokens_seen": 30101952, "step": 29920 }, { "epoch": 14.108910891089108, "grad_norm": 0.1865377426147461, "learning_rate": 1.2092497866703034e-05, "loss": 0.038, "num_input_tokens_seen": 30106912, "step": 29925 }, { "epoch": 14.11126826968411, "grad_norm": 3.2674708366394043, "learning_rate": 1.208368994552346e-05, "loss": 0.2499, "num_input_tokens_seen": 30114144, "step": 29930 }, { "epoch": 14.113625648279113, "grad_norm": 0.30695557594299316, "learning_rate": 1.207488421085586e-05, "loss": 0.0861, "num_input_tokens_seen": 30118976, "step": 29935 }, { "epoch": 14.115983026874115, "grad_norm": 0.09481994062662125, "learning_rate": 1.2066080664190909e-05, "loss": 0.0685, "num_input_tokens_seen": 30124608, "step": 29940 }, { "epoch": 14.118340405469118, "grad_norm": 0.9034886360168457, "learning_rate": 1.2057279307018896e-05, "loss": 0.157, "num_input_tokens_seen": 30130080, "step": 29945 }, { "epoch": 14.12069778406412, "grad_norm": 0.09401355683803558, "learning_rate": 1.2048480140829741e-05, "loss": 0.1247, "num_input_tokens_seen": 30135648, "step": 29950 }, { "epoch": 14.123055162659123, "grad_norm": 1.1342921257019043, "learning_rate": 1.2039683167112995e-05, "loss": 0.1632, "num_input_tokens_seen": 30141376, "step": 29955 }, { "epoch": 14.125412541254125, "grad_norm": 0.24471916258335114, "learning_rate": 1.2030888387357834e-05, "loss": 0.0311, "num_input_tokens_seen": 30146656, "step": 29960 }, { "epoch": 14.127769919849127, "grad_norm": 1.0117143392562866, "learning_rate": 1.2022095803053066e-05, "loss": 0.168, "num_input_tokens_seen": 30152224, "step": 29965 }, { "epoch": 14.13012729844413, "grad_norm": 0.18527834117412567, "learning_rate": 1.2013305415687137e-05, "loss": 0.1915, "num_input_tokens_seen": 30157696, "step": 29970 }, { "epoch": 14.132484677039132, "grad_norm": 1.1201958656311035, "learning_rate": 1.2004517226748089e-05, "loss": 0.0703, "num_input_tokens_seen": 30162368, "step": 29975 }, { "epoch": 14.134842055634135, "grad_norm": 0.08849647641181946, "learning_rate": 1.1995731237723623e-05, "loss": 0.2457, "num_input_tokens_seen": 30168320, "step": 29980 }, { "epoch": 14.137199434229137, "grad_norm": 0.3956175148487091, "learning_rate": 1.198694745010106e-05, "loss": 0.0392, "num_input_tokens_seen": 30172736, "step": 29985 }, { "epoch": 14.13955681282414, "grad_norm": 1.1337642669677734, "learning_rate": 1.1978165865367344e-05, "loss": 0.1466, "num_input_tokens_seen": 30176832, "step": 29990 }, { "epoch": 14.141914191419142, "grad_norm": 1.6340454816818237, "learning_rate": 1.1969386485009049e-05, "loss": 0.1261, "num_input_tokens_seen": 30181280, "step": 29995 }, { "epoch": 14.144271570014144, "grad_norm": 0.4119277000427246, "learning_rate": 1.1960609310512375e-05, "loss": 0.2021, "num_input_tokens_seen": 30186624, "step": 30000 }, { "epoch": 14.146628948609147, "grad_norm": 1.1529968976974487, "learning_rate": 1.1951834343363146e-05, "loss": 0.1355, "num_input_tokens_seen": 30191584, "step": 30005 }, { "epoch": 14.148986327204149, "grad_norm": 2.3258655071258545, "learning_rate": 1.194306158504683e-05, "loss": 0.164, "num_input_tokens_seen": 30197024, "step": 30010 }, { "epoch": 14.151343705799151, "grad_norm": 0.4520375728607178, "learning_rate": 1.1934291037048483e-05, "loss": 0.1212, "num_input_tokens_seen": 30202208, "step": 30015 }, { "epoch": 14.153701084394154, "grad_norm": 0.2811161279678345, "learning_rate": 1.1925522700852821e-05, "loss": 0.0263, "num_input_tokens_seen": 30207232, "step": 30020 }, { "epoch": 14.156058462989156, "grad_norm": 0.6310303807258606, "learning_rate": 1.1916756577944174e-05, "loss": 0.1291, "num_input_tokens_seen": 30212352, "step": 30025 }, { "epoch": 14.158415841584159, "grad_norm": 1.2881149053573608, "learning_rate": 1.1907992669806495e-05, "loss": 0.1907, "num_input_tokens_seen": 30217280, "step": 30030 }, { "epoch": 14.160773220179161, "grad_norm": 0.8459639549255371, "learning_rate": 1.189923097792337e-05, "loss": 0.0587, "num_input_tokens_seen": 30221568, "step": 30035 }, { "epoch": 14.163130598774163, "grad_norm": 0.13607262074947357, "learning_rate": 1.1890471503778e-05, "loss": 0.0231, "num_input_tokens_seen": 30226432, "step": 30040 }, { "epoch": 14.165487977369166, "grad_norm": 0.1791595220565796, "learning_rate": 1.1881714248853218e-05, "loss": 0.0604, "num_input_tokens_seen": 30230816, "step": 30045 }, { "epoch": 14.167845355964168, "grad_norm": 1.284388542175293, "learning_rate": 1.1872959214631487e-05, "loss": 0.1564, "num_input_tokens_seen": 30237088, "step": 30050 }, { "epoch": 14.17020273455917, "grad_norm": 1.1005734205245972, "learning_rate": 1.1864206402594869e-05, "loss": 0.1248, "num_input_tokens_seen": 30241888, "step": 30055 }, { "epoch": 14.172560113154173, "grad_norm": 0.4592437446117401, "learning_rate": 1.1855455814225071e-05, "loss": 0.1369, "num_input_tokens_seen": 30248192, "step": 30060 }, { "epoch": 14.174917491749175, "grad_norm": 0.25567635893821716, "learning_rate": 1.1846707451003425e-05, "loss": 0.0328, "num_input_tokens_seen": 30253056, "step": 30065 }, { "epoch": 14.177274870344178, "grad_norm": 1.1206600666046143, "learning_rate": 1.1837961314410879e-05, "loss": 0.0308, "num_input_tokens_seen": 30257056, "step": 30070 }, { "epoch": 14.17963224893918, "grad_norm": 1.8075803518295288, "learning_rate": 1.1829217405928e-05, "loss": 0.0948, "num_input_tokens_seen": 30262624, "step": 30075 }, { "epoch": 14.181989627534183, "grad_norm": 0.01796480268239975, "learning_rate": 1.1820475727034989e-05, "loss": 0.0405, "num_input_tokens_seen": 30267232, "step": 30080 }, { "epoch": 14.184347006129185, "grad_norm": 0.7236055731773376, "learning_rate": 1.1811736279211657e-05, "loss": 0.0701, "num_input_tokens_seen": 30273088, "step": 30085 }, { "epoch": 14.186704384724187, "grad_norm": 0.3812538981437683, "learning_rate": 1.1802999063937465e-05, "loss": 0.0289, "num_input_tokens_seen": 30278624, "step": 30090 }, { "epoch": 14.18906176331919, "grad_norm": 1.8417303562164307, "learning_rate": 1.1794264082691444e-05, "loss": 0.1846, "num_input_tokens_seen": 30284544, "step": 30095 }, { "epoch": 14.191419141914192, "grad_norm": 0.7550889253616333, "learning_rate": 1.1785531336952296e-05, "loss": 0.0348, "num_input_tokens_seen": 30291488, "step": 30100 }, { "epoch": 14.193776520509195, "grad_norm": 0.1844474971294403, "learning_rate": 1.1776800828198325e-05, "loss": 0.0125, "num_input_tokens_seen": 30296000, "step": 30105 }, { "epoch": 14.196133899104197, "grad_norm": 0.21373534202575684, "learning_rate": 1.1768072557907465e-05, "loss": 0.2326, "num_input_tokens_seen": 30301120, "step": 30110 }, { "epoch": 14.198491277699198, "grad_norm": 0.15017399191856384, "learning_rate": 1.1759346527557246e-05, "loss": 0.0747, "num_input_tokens_seen": 30306432, "step": 30115 }, { "epoch": 14.2008486562942, "grad_norm": 0.20713886618614197, "learning_rate": 1.1750622738624847e-05, "loss": 0.0657, "num_input_tokens_seen": 30312352, "step": 30120 }, { "epoch": 14.203206034889202, "grad_norm": 0.4082963764667511, "learning_rate": 1.1741901192587053e-05, "loss": 0.0449, "num_input_tokens_seen": 30317120, "step": 30125 }, { "epoch": 14.205563413484205, "grad_norm": 0.8942925930023193, "learning_rate": 1.1733181890920283e-05, "loss": 0.0955, "num_input_tokens_seen": 30322112, "step": 30130 }, { "epoch": 14.207920792079207, "grad_norm": 0.7559837102890015, "learning_rate": 1.172446483510056e-05, "loss": 0.094, "num_input_tokens_seen": 30328064, "step": 30135 }, { "epoch": 14.21027817067421, "grad_norm": 1.02833890914917, "learning_rate": 1.1715750026603542e-05, "loss": 0.1298, "num_input_tokens_seen": 30333632, "step": 30140 }, { "epoch": 14.212635549269212, "grad_norm": 0.3006826341152191, "learning_rate": 1.170703746690448e-05, "loss": 0.0494, "num_input_tokens_seen": 30338400, "step": 30145 }, { "epoch": 14.214992927864214, "grad_norm": 0.004972428549081087, "learning_rate": 1.1698327157478275e-05, "loss": 0.2485, "num_input_tokens_seen": 30343296, "step": 30150 }, { "epoch": 14.217350306459217, "grad_norm": 0.42553791403770447, "learning_rate": 1.1689619099799434e-05, "loss": 0.1981, "num_input_tokens_seen": 30348320, "step": 30155 }, { "epoch": 14.21970768505422, "grad_norm": 0.2401614636182785, "learning_rate": 1.168091329534208e-05, "loss": 0.1481, "num_input_tokens_seen": 30353344, "step": 30160 }, { "epoch": 14.222065063649222, "grad_norm": 0.06756847351789474, "learning_rate": 1.1672209745579962e-05, "loss": 0.2126, "num_input_tokens_seen": 30359040, "step": 30165 }, { "epoch": 14.224422442244224, "grad_norm": 0.3188093900680542, "learning_rate": 1.1663508451986442e-05, "loss": 0.107, "num_input_tokens_seen": 30363840, "step": 30170 }, { "epoch": 14.226779820839226, "grad_norm": 3.1488232612609863, "learning_rate": 1.1654809416034496e-05, "loss": 0.0672, "num_input_tokens_seen": 30368832, "step": 30175 }, { "epoch": 14.229137199434229, "grad_norm": 0.1850002110004425, "learning_rate": 1.1646112639196739e-05, "loss": 0.0629, "num_input_tokens_seen": 30372896, "step": 30180 }, { "epoch": 14.231494578029231, "grad_norm": 0.02824614942073822, "learning_rate": 1.1637418122945368e-05, "loss": 0.2569, "num_input_tokens_seen": 30377696, "step": 30185 }, { "epoch": 14.233851956624234, "grad_norm": 1.234604835510254, "learning_rate": 1.1628725868752222e-05, "loss": 0.0998, "num_input_tokens_seen": 30383584, "step": 30190 }, { "epoch": 14.236209335219236, "grad_norm": 0.09856494516134262, "learning_rate": 1.162003587808876e-05, "loss": 0.1648, "num_input_tokens_seen": 30388640, "step": 30195 }, { "epoch": 14.238566713814238, "grad_norm": 0.13484644889831543, "learning_rate": 1.161134815242604e-05, "loss": 0.0298, "num_input_tokens_seen": 30395712, "step": 30200 }, { "epoch": 14.24092409240924, "grad_norm": 0.29405707120895386, "learning_rate": 1.1602662693234753e-05, "loss": 0.1535, "num_input_tokens_seen": 30400768, "step": 30205 }, { "epoch": 14.243281471004243, "grad_norm": 0.8164129853248596, "learning_rate": 1.1593979501985198e-05, "loss": 0.0379, "num_input_tokens_seen": 30405696, "step": 30210 }, { "epoch": 14.245638849599246, "grad_norm": 0.08510389924049377, "learning_rate": 1.1585298580147294e-05, "loss": 0.011, "num_input_tokens_seen": 30410144, "step": 30215 }, { "epoch": 14.247996228194248, "grad_norm": 0.5347925424575806, "learning_rate": 1.157661992919058e-05, "loss": 0.0686, "num_input_tokens_seen": 30415200, "step": 30220 }, { "epoch": 14.25035360678925, "grad_norm": 2.581472635269165, "learning_rate": 1.1567943550584185e-05, "loss": 0.1959, "num_input_tokens_seen": 30420736, "step": 30225 }, { "epoch": 14.252710985384253, "grad_norm": 1.5847587585449219, "learning_rate": 1.1559269445796886e-05, "loss": 0.1548, "num_input_tokens_seen": 30424832, "step": 30230 }, { "epoch": 14.255068363979255, "grad_norm": 0.02351069264113903, "learning_rate": 1.1550597616297057e-05, "loss": 0.0325, "num_input_tokens_seen": 30429312, "step": 30235 }, { "epoch": 14.257425742574258, "grad_norm": 2.06054949760437, "learning_rate": 1.1541928063552696e-05, "loss": 0.106, "num_input_tokens_seen": 30434560, "step": 30240 }, { "epoch": 14.25978312116926, "grad_norm": 0.3190666735172272, "learning_rate": 1.1533260789031405e-05, "loss": 0.0652, "num_input_tokens_seen": 30442432, "step": 30245 }, { "epoch": 14.262140499764262, "grad_norm": 0.43609848618507385, "learning_rate": 1.1524595794200413e-05, "loss": 0.0227, "num_input_tokens_seen": 30446784, "step": 30250 }, { "epoch": 14.264497878359265, "grad_norm": 0.43535086512565613, "learning_rate": 1.1515933080526553e-05, "loss": 0.0543, "num_input_tokens_seen": 30452224, "step": 30255 }, { "epoch": 14.266855256954267, "grad_norm": 0.5250294804573059, "learning_rate": 1.1507272649476286e-05, "loss": 0.1468, "num_input_tokens_seen": 30456864, "step": 30260 }, { "epoch": 14.26921263554927, "grad_norm": 1.2719954252243042, "learning_rate": 1.1498614502515656e-05, "loss": 0.2481, "num_input_tokens_seen": 30462336, "step": 30265 }, { "epoch": 14.271570014144272, "grad_norm": 0.853724479675293, "learning_rate": 1.1489958641110351e-05, "loss": 0.1431, "num_input_tokens_seen": 30467104, "step": 30270 }, { "epoch": 14.273927392739274, "grad_norm": 0.811379075050354, "learning_rate": 1.1481305066725662e-05, "loss": 0.1356, "num_input_tokens_seen": 30472736, "step": 30275 }, { "epoch": 14.276284771334277, "grad_norm": 0.8172282576560974, "learning_rate": 1.1472653780826486e-05, "loss": 0.0997, "num_input_tokens_seen": 30478272, "step": 30280 }, { "epoch": 14.27864214992928, "grad_norm": 1.1414939165115356, "learning_rate": 1.146400478487735e-05, "loss": 0.0609, "num_input_tokens_seen": 30483584, "step": 30285 }, { "epoch": 14.280999528524282, "grad_norm": 1.0073033571243286, "learning_rate": 1.1455358080342372e-05, "loss": 0.1006, "num_input_tokens_seen": 30489888, "step": 30290 }, { "epoch": 14.283356907119284, "grad_norm": 0.21730071306228638, "learning_rate": 1.1446713668685297e-05, "loss": 0.0537, "num_input_tokens_seen": 30494080, "step": 30295 }, { "epoch": 14.285714285714286, "grad_norm": 0.08457253873348236, "learning_rate": 1.1438071551369484e-05, "loss": 0.113, "num_input_tokens_seen": 30498880, "step": 30300 }, { "epoch": 14.288071664309289, "grad_norm": 0.07319167256355286, "learning_rate": 1.1429431729857881e-05, "loss": 0.0185, "num_input_tokens_seen": 30504288, "step": 30305 }, { "epoch": 14.290429042904291, "grad_norm": 1.2747952938079834, "learning_rate": 1.142079420561307e-05, "loss": 0.1094, "num_input_tokens_seen": 30509248, "step": 30310 }, { "epoch": 14.292786421499294, "grad_norm": 0.41510823369026184, "learning_rate": 1.141215898009724e-05, "loss": 0.0636, "num_input_tokens_seen": 30513984, "step": 30315 }, { "epoch": 14.295143800094294, "grad_norm": 0.2616771161556244, "learning_rate": 1.1403526054772184e-05, "loss": 0.0306, "num_input_tokens_seen": 30518848, "step": 30320 }, { "epoch": 14.297501178689297, "grad_norm": 0.20632721483707428, "learning_rate": 1.1394895431099314e-05, "loss": 0.2146, "num_input_tokens_seen": 30523296, "step": 30325 }, { "epoch": 14.299858557284299, "grad_norm": 0.29104793071746826, "learning_rate": 1.1386267110539644e-05, "loss": 0.1051, "num_input_tokens_seen": 30528864, "step": 30330 }, { "epoch": 14.302215935879302, "grad_norm": 2.124500274658203, "learning_rate": 1.1377641094553815e-05, "loss": 0.1974, "num_input_tokens_seen": 30534400, "step": 30335 }, { "epoch": 14.304573314474304, "grad_norm": 0.9644743204116821, "learning_rate": 1.1369017384602046e-05, "loss": 0.1009, "num_input_tokens_seen": 30539712, "step": 30340 }, { "epoch": 14.306930693069306, "grad_norm": 0.14883136749267578, "learning_rate": 1.1360395982144201e-05, "loss": 0.0435, "num_input_tokens_seen": 30544544, "step": 30345 }, { "epoch": 14.309288071664309, "grad_norm": 0.12986691296100616, "learning_rate": 1.1351776888639717e-05, "loss": 0.0235, "num_input_tokens_seen": 30549088, "step": 30350 }, { "epoch": 14.311645450259311, "grad_norm": 1.5250154733657837, "learning_rate": 1.1343160105547677e-05, "loss": 0.2816, "num_input_tokens_seen": 30554272, "step": 30355 }, { "epoch": 14.314002828854314, "grad_norm": 0.21305739879608154, "learning_rate": 1.1334545634326746e-05, "loss": 0.1599, "num_input_tokens_seen": 30559424, "step": 30360 }, { "epoch": 14.316360207449316, "grad_norm": 0.9645308256149292, "learning_rate": 1.1325933476435216e-05, "loss": 0.0439, "num_input_tokens_seen": 30564352, "step": 30365 }, { "epoch": 14.318717586044318, "grad_norm": 0.024708056822419167, "learning_rate": 1.1317323633330975e-05, "loss": 0.123, "num_input_tokens_seen": 30568800, "step": 30370 }, { "epoch": 14.32107496463932, "grad_norm": 1.1893415451049805, "learning_rate": 1.1308716106471526e-05, "loss": 0.0789, "num_input_tokens_seen": 30573024, "step": 30375 }, { "epoch": 14.323432343234323, "grad_norm": 1.497166395187378, "learning_rate": 1.1300110897313973e-05, "loss": 0.1532, "num_input_tokens_seen": 30577760, "step": 30380 }, { "epoch": 14.325789721829326, "grad_norm": 0.15063998103141785, "learning_rate": 1.1291508007315046e-05, "loss": 0.1607, "num_input_tokens_seen": 30582880, "step": 30385 }, { "epoch": 14.328147100424328, "grad_norm": 0.014131059870123863, "learning_rate": 1.1282907437931046e-05, "loss": 0.1384, "num_input_tokens_seen": 30587264, "step": 30390 }, { "epoch": 14.33050447901933, "grad_norm": 0.18879742920398712, "learning_rate": 1.1274309190617913e-05, "loss": 0.196, "num_input_tokens_seen": 30590944, "step": 30395 }, { "epoch": 14.332861857614333, "grad_norm": 0.820427656173706, "learning_rate": 1.1265713266831184e-05, "loss": 0.1687, "num_input_tokens_seen": 30595968, "step": 30400 }, { "epoch": 14.335219236209335, "grad_norm": 0.4678191542625427, "learning_rate": 1.1257119668026006e-05, "loss": 0.0709, "num_input_tokens_seen": 30600448, "step": 30405 }, { "epoch": 14.337576614804338, "grad_norm": 0.057602111250162125, "learning_rate": 1.1248528395657129e-05, "loss": 0.0765, "num_input_tokens_seen": 30607968, "step": 30410 }, { "epoch": 14.33993399339934, "grad_norm": 0.4979265630245209, "learning_rate": 1.1239939451178905e-05, "loss": 0.1108, "num_input_tokens_seen": 30612000, "step": 30415 }, { "epoch": 14.342291371994342, "grad_norm": 0.03871415555477142, "learning_rate": 1.12313528360453e-05, "loss": 0.1293, "num_input_tokens_seen": 30616704, "step": 30420 }, { "epoch": 14.344648750589345, "grad_norm": 0.5103113055229187, "learning_rate": 1.1222768551709892e-05, "loss": 0.0294, "num_input_tokens_seen": 30621536, "step": 30425 }, { "epoch": 14.347006129184347, "grad_norm": 2.9974048137664795, "learning_rate": 1.1214186599625833e-05, "loss": 0.1025, "num_input_tokens_seen": 30626112, "step": 30430 }, { "epoch": 14.34936350777935, "grad_norm": 0.31376534700393677, "learning_rate": 1.1205606981245914e-05, "loss": 0.1094, "num_input_tokens_seen": 30632224, "step": 30435 }, { "epoch": 14.351720886374352, "grad_norm": 0.15822678804397583, "learning_rate": 1.1197029698022519e-05, "loss": 0.0739, "num_input_tokens_seen": 30637824, "step": 30440 }, { "epoch": 14.354078264969354, "grad_norm": 0.36507919430732727, "learning_rate": 1.118845475140763e-05, "loss": 0.1691, "num_input_tokens_seen": 30642496, "step": 30445 }, { "epoch": 14.356435643564357, "grad_norm": 0.3467041850090027, "learning_rate": 1.1179882142852852e-05, "loss": 0.0947, "num_input_tokens_seen": 30647744, "step": 30450 }, { "epoch": 14.35879302215936, "grad_norm": 0.6888874769210815, "learning_rate": 1.117131187380937e-05, "loss": 0.1077, "num_input_tokens_seen": 30652576, "step": 30455 }, { "epoch": 14.361150400754362, "grad_norm": 1.2767269611358643, "learning_rate": 1.1162743945727991e-05, "loss": 0.1644, "num_input_tokens_seen": 30657760, "step": 30460 }, { "epoch": 14.363507779349364, "grad_norm": 0.30349570512771606, "learning_rate": 1.1154178360059129e-05, "loss": 0.0298, "num_input_tokens_seen": 30662976, "step": 30465 }, { "epoch": 14.365865157944366, "grad_norm": 0.008183132857084274, "learning_rate": 1.1145615118252772e-05, "loss": 0.0172, "num_input_tokens_seen": 30667200, "step": 30470 }, { "epoch": 14.368222536539369, "grad_norm": 2.0601112842559814, "learning_rate": 1.1137054221758541e-05, "loss": 0.1901, "num_input_tokens_seen": 30671872, "step": 30475 }, { "epoch": 14.370579915134371, "grad_norm": 0.15083514153957367, "learning_rate": 1.1128495672025652e-05, "loss": 0.2697, "num_input_tokens_seen": 30676480, "step": 30480 }, { "epoch": 14.372937293729374, "grad_norm": 1.7263669967651367, "learning_rate": 1.1119939470502921e-05, "loss": 0.1233, "num_input_tokens_seen": 30683456, "step": 30485 }, { "epoch": 14.375294672324376, "grad_norm": 0.42515841126441956, "learning_rate": 1.1111385618638767e-05, "loss": 0.1164, "num_input_tokens_seen": 30687680, "step": 30490 }, { "epoch": 14.377652050919378, "grad_norm": 0.7276237607002258, "learning_rate": 1.110283411788121e-05, "loss": 0.0382, "num_input_tokens_seen": 30692736, "step": 30495 }, { "epoch": 14.38000942951438, "grad_norm": 0.15243136882781982, "learning_rate": 1.1094284969677881e-05, "loss": 0.077, "num_input_tokens_seen": 30696960, "step": 30500 }, { "epoch": 14.382366808109383, "grad_norm": 0.08969037234783173, "learning_rate": 1.1085738175476008e-05, "loss": 0.1763, "num_input_tokens_seen": 30701024, "step": 30505 }, { "epoch": 14.384724186704386, "grad_norm": 2.639019012451172, "learning_rate": 1.1077193736722402e-05, "loss": 0.2376, "num_input_tokens_seen": 30705440, "step": 30510 }, { "epoch": 14.387081565299386, "grad_norm": 0.23139770328998566, "learning_rate": 1.1068651654863502e-05, "loss": 0.014, "num_input_tokens_seen": 30709888, "step": 30515 }, { "epoch": 14.389438943894389, "grad_norm": 0.04733993485569954, "learning_rate": 1.1060111931345336e-05, "loss": 0.045, "num_input_tokens_seen": 30714848, "step": 30520 }, { "epoch": 14.391796322489391, "grad_norm": 0.292928010225296, "learning_rate": 1.1051574567613532e-05, "loss": 0.1263, "num_input_tokens_seen": 30720160, "step": 30525 }, { "epoch": 14.394153701084393, "grad_norm": 1.4079147577285767, "learning_rate": 1.1043039565113322e-05, "loss": 0.1158, "num_input_tokens_seen": 30724800, "step": 30530 }, { "epoch": 14.396511079679396, "grad_norm": 0.4111202657222748, "learning_rate": 1.103450692528954e-05, "loss": 0.1032, "num_input_tokens_seen": 30730528, "step": 30535 }, { "epoch": 14.398868458274398, "grad_norm": 0.291329950094223, "learning_rate": 1.1025976649586615e-05, "loss": 0.1022, "num_input_tokens_seen": 30735808, "step": 30540 }, { "epoch": 14.4012258368694, "grad_norm": 0.26886627078056335, "learning_rate": 1.1017448739448586e-05, "loss": 0.0805, "num_input_tokens_seen": 30740928, "step": 30545 }, { "epoch": 14.403583215464403, "grad_norm": 0.5343343019485474, "learning_rate": 1.1008923196319068e-05, "loss": 0.1939, "num_input_tokens_seen": 30746592, "step": 30550 }, { "epoch": 14.405940594059405, "grad_norm": 2.320854425430298, "learning_rate": 1.1000400021641296e-05, "loss": 0.1202, "num_input_tokens_seen": 30752032, "step": 30555 }, { "epoch": 14.408297972654408, "grad_norm": 0.08059409260749817, "learning_rate": 1.0991879216858112e-05, "loss": 0.1418, "num_input_tokens_seen": 30756448, "step": 30560 }, { "epoch": 14.41065535124941, "grad_norm": 0.1668047159910202, "learning_rate": 1.098336078341192e-05, "loss": 0.0607, "num_input_tokens_seen": 30760544, "step": 30565 }, { "epoch": 14.413012729844413, "grad_norm": 0.2819424271583557, "learning_rate": 1.097484472274476e-05, "loss": 0.0604, "num_input_tokens_seen": 30766112, "step": 30570 }, { "epoch": 14.415370108439415, "grad_norm": 0.1248236820101738, "learning_rate": 1.0966331036298255e-05, "loss": 0.1652, "num_input_tokens_seen": 30771808, "step": 30575 }, { "epoch": 14.417727487034417, "grad_norm": 1.6460942029953003, "learning_rate": 1.0957819725513627e-05, "loss": 0.1516, "num_input_tokens_seen": 30776864, "step": 30580 }, { "epoch": 14.42008486562942, "grad_norm": 1.2169421911239624, "learning_rate": 1.0949310791831696e-05, "loss": 0.0613, "num_input_tokens_seen": 30781568, "step": 30585 }, { "epoch": 14.422442244224422, "grad_norm": 1.8405778408050537, "learning_rate": 1.0940804236692884e-05, "loss": 0.1496, "num_input_tokens_seen": 30786688, "step": 30590 }, { "epoch": 14.424799622819425, "grad_norm": 0.3793698251247406, "learning_rate": 1.093230006153721e-05, "loss": 0.0509, "num_input_tokens_seen": 30790784, "step": 30595 }, { "epoch": 14.427157001414427, "grad_norm": 1.1379036903381348, "learning_rate": 1.0923798267804269e-05, "loss": 0.1967, "num_input_tokens_seen": 30796160, "step": 30600 }, { "epoch": 14.42951438000943, "grad_norm": 0.4474835693836212, "learning_rate": 1.0915298856933285e-05, "loss": 0.0661, "num_input_tokens_seen": 30801504, "step": 30605 }, { "epoch": 14.431871758604432, "grad_norm": 0.6425930857658386, "learning_rate": 1.0906801830363056e-05, "loss": 0.0661, "num_input_tokens_seen": 30805696, "step": 30610 }, { "epoch": 14.434229137199434, "grad_norm": 1.1084312200546265, "learning_rate": 1.089830718953199e-05, "loss": 0.0614, "num_input_tokens_seen": 30811200, "step": 30615 }, { "epoch": 14.436586515794437, "grad_norm": 0.8070443868637085, "learning_rate": 1.0889814935878084e-05, "loss": 0.227, "num_input_tokens_seen": 30815904, "step": 30620 }, { "epoch": 14.438943894389439, "grad_norm": 0.8255713582038879, "learning_rate": 1.088132507083893e-05, "loss": 0.0463, "num_input_tokens_seen": 30820544, "step": 30625 }, { "epoch": 14.441301272984441, "grad_norm": 0.5746713280677795, "learning_rate": 1.0872837595851723e-05, "loss": 0.2096, "num_input_tokens_seen": 30825152, "step": 30630 }, { "epoch": 14.443658651579444, "grad_norm": 0.17948786914348602, "learning_rate": 1.0864352512353251e-05, "loss": 0.0939, "num_input_tokens_seen": 30831072, "step": 30635 }, { "epoch": 14.446016030174446, "grad_norm": 0.59732586145401, "learning_rate": 1.0855869821779882e-05, "loss": 0.0866, "num_input_tokens_seen": 30835200, "step": 30640 }, { "epoch": 14.448373408769449, "grad_norm": 0.5172486901283264, "learning_rate": 1.08473895255676e-05, "loss": 0.1273, "num_input_tokens_seen": 30839584, "step": 30645 }, { "epoch": 14.450730787364451, "grad_norm": 0.21500687301158905, "learning_rate": 1.0838911625151973e-05, "loss": 0.1128, "num_input_tokens_seen": 30845056, "step": 30650 }, { "epoch": 14.453088165959453, "grad_norm": 2.29553484916687, "learning_rate": 1.0830436121968166e-05, "loss": 0.147, "num_input_tokens_seen": 30850976, "step": 30655 }, { "epoch": 14.455445544554456, "grad_norm": 0.05181395262479782, "learning_rate": 1.0821963017450937e-05, "loss": 0.2309, "num_input_tokens_seen": 30855136, "step": 30660 }, { "epoch": 14.457802923149458, "grad_norm": 0.04989982023835182, "learning_rate": 1.081349231303464e-05, "loss": 0.0405, "num_input_tokens_seen": 30859712, "step": 30665 }, { "epoch": 14.46016030174446, "grad_norm": 0.016048137098550797, "learning_rate": 1.080502401015322e-05, "loss": 0.1468, "num_input_tokens_seen": 30863936, "step": 30670 }, { "epoch": 14.462517680339463, "grad_norm": 0.2440868467092514, "learning_rate": 1.079655811024023e-05, "loss": 0.0528, "num_input_tokens_seen": 30869216, "step": 30675 }, { "epoch": 14.464875058934465, "grad_norm": 1.546670913696289, "learning_rate": 1.078809461472878e-05, "loss": 0.1693, "num_input_tokens_seen": 30874080, "step": 30680 }, { "epoch": 14.467232437529468, "grad_norm": 1.4678890705108643, "learning_rate": 1.0779633525051608e-05, "loss": 0.0629, "num_input_tokens_seen": 30878784, "step": 30685 }, { "epoch": 14.46958981612447, "grad_norm": 1.1273776292800903, "learning_rate": 1.0771174842641032e-05, "loss": 0.3089, "num_input_tokens_seen": 30884096, "step": 30690 }, { "epoch": 14.471947194719473, "grad_norm": 1.0945712327957153, "learning_rate": 1.0762718568928965e-05, "loss": 0.05, "num_input_tokens_seen": 30889952, "step": 30695 }, { "epoch": 14.474304573314475, "grad_norm": 0.13190224766731262, "learning_rate": 1.0754264705346908e-05, "loss": 0.0171, "num_input_tokens_seen": 30894560, "step": 30700 }, { "epoch": 14.476661951909477, "grad_norm": 1.5199998617172241, "learning_rate": 1.0745813253325957e-05, "loss": 0.2915, "num_input_tokens_seen": 30900288, "step": 30705 }, { "epoch": 14.47901933050448, "grad_norm": 0.124924436211586, "learning_rate": 1.07373642142968e-05, "loss": 0.0569, "num_input_tokens_seen": 30905024, "step": 30710 }, { "epoch": 14.481376709099482, "grad_norm": 0.15036699175834656, "learning_rate": 1.0728917589689725e-05, "loss": 0.0723, "num_input_tokens_seen": 30909408, "step": 30715 }, { "epoch": 14.483734087694483, "grad_norm": 0.056181471794843674, "learning_rate": 1.0720473380934584e-05, "loss": 0.2735, "num_input_tokens_seen": 30914272, "step": 30720 }, { "epoch": 14.486091466289485, "grad_norm": 0.7652191519737244, "learning_rate": 1.0712031589460846e-05, "loss": 0.1284, "num_input_tokens_seen": 30918432, "step": 30725 }, { "epoch": 14.488448844884488, "grad_norm": 0.5846796631813049, "learning_rate": 1.0703592216697567e-05, "loss": 0.147, "num_input_tokens_seen": 30923648, "step": 30730 }, { "epoch": 14.49080622347949, "grad_norm": 0.5903633832931519, "learning_rate": 1.0695155264073384e-05, "loss": 0.1545, "num_input_tokens_seen": 30928736, "step": 30735 }, { "epoch": 14.493163602074493, "grad_norm": 0.9531826376914978, "learning_rate": 1.0686720733016531e-05, "loss": 0.0766, "num_input_tokens_seen": 30933824, "step": 30740 }, { "epoch": 14.495520980669495, "grad_norm": 0.6807532906532288, "learning_rate": 1.0678288624954835e-05, "loss": 0.1384, "num_input_tokens_seen": 30938880, "step": 30745 }, { "epoch": 14.497878359264497, "grad_norm": 0.3077785074710846, "learning_rate": 1.0669858941315705e-05, "loss": 0.0555, "num_input_tokens_seen": 30943392, "step": 30750 }, { "epoch": 14.5002357378595, "grad_norm": 0.16134968400001526, "learning_rate": 1.0661431683526155e-05, "loss": 0.0688, "num_input_tokens_seen": 30948896, "step": 30755 }, { "epoch": 14.502593116454502, "grad_norm": 1.7999204397201538, "learning_rate": 1.0653006853012753e-05, "loss": 0.0918, "num_input_tokens_seen": 30953952, "step": 30760 }, { "epoch": 14.504950495049505, "grad_norm": 2.9431989192962646, "learning_rate": 1.0644584451201694e-05, "loss": 0.1171, "num_input_tokens_seen": 30958048, "step": 30765 }, { "epoch": 14.507307873644507, "grad_norm": 2.2075536251068115, "learning_rate": 1.0636164479518748e-05, "loss": 0.0777, "num_input_tokens_seen": 30962400, "step": 30770 }, { "epoch": 14.50966525223951, "grad_norm": 0.08573810011148453, "learning_rate": 1.0627746939389272e-05, "loss": 0.0112, "num_input_tokens_seen": 30966528, "step": 30775 }, { "epoch": 14.512022630834512, "grad_norm": 0.18888111412525177, "learning_rate": 1.061933183223821e-05, "loss": 0.0298, "num_input_tokens_seen": 30970816, "step": 30780 }, { "epoch": 14.514380009429514, "grad_norm": 1.501300573348999, "learning_rate": 1.061091915949011e-05, "loss": 0.1007, "num_input_tokens_seen": 30976352, "step": 30785 }, { "epoch": 14.516737388024517, "grad_norm": 1.8609309196472168, "learning_rate": 1.0602508922569075e-05, "loss": 0.0987, "num_input_tokens_seen": 30982912, "step": 30790 }, { "epoch": 14.519094766619519, "grad_norm": 0.7108932137489319, "learning_rate": 1.0594101122898823e-05, "loss": 0.1227, "num_input_tokens_seen": 30987744, "step": 30795 }, { "epoch": 14.521452145214521, "grad_norm": 0.8284531235694885, "learning_rate": 1.058569576190265e-05, "loss": 0.1305, "num_input_tokens_seen": 30992544, "step": 30800 }, { "epoch": 14.523809523809524, "grad_norm": 0.04980229213833809, "learning_rate": 1.0577292841003456e-05, "loss": 0.0165, "num_input_tokens_seen": 30997152, "step": 30805 }, { "epoch": 14.526166902404526, "grad_norm": 0.12550140917301178, "learning_rate": 1.056889236162369e-05, "loss": 0.0998, "num_input_tokens_seen": 31001760, "step": 30810 }, { "epoch": 14.528524280999529, "grad_norm": 0.2889016568660736, "learning_rate": 1.0560494325185422e-05, "loss": 0.0282, "num_input_tokens_seen": 31007072, "step": 30815 }, { "epoch": 14.530881659594531, "grad_norm": 0.338598370552063, "learning_rate": 1.0552098733110294e-05, "loss": 0.0557, "num_input_tokens_seen": 31011680, "step": 30820 }, { "epoch": 14.533239038189533, "grad_norm": 0.9266496896743774, "learning_rate": 1.0543705586819541e-05, "loss": 0.1853, "num_input_tokens_seen": 31016032, "step": 30825 }, { "epoch": 14.535596416784536, "grad_norm": 1.589223861694336, "learning_rate": 1.0535314887733977e-05, "loss": 0.2129, "num_input_tokens_seen": 31021728, "step": 30830 }, { "epoch": 14.537953795379538, "grad_norm": 0.1425682157278061, "learning_rate": 1.0526926637274007e-05, "loss": 0.0914, "num_input_tokens_seen": 31026112, "step": 30835 }, { "epoch": 14.54031117397454, "grad_norm": 0.5570306181907654, "learning_rate": 1.0518540836859616e-05, "loss": 0.0332, "num_input_tokens_seen": 31031904, "step": 30840 }, { "epoch": 14.542668552569543, "grad_norm": 0.041960109025239944, "learning_rate": 1.051015748791039e-05, "loss": 0.1062, "num_input_tokens_seen": 31037600, "step": 30845 }, { "epoch": 14.545025931164545, "grad_norm": 0.6714910268783569, "learning_rate": 1.0501776591845472e-05, "loss": 0.078, "num_input_tokens_seen": 31042816, "step": 30850 }, { "epoch": 14.547383309759548, "grad_norm": 0.7442656755447388, "learning_rate": 1.0493398150083608e-05, "loss": 0.0647, "num_input_tokens_seen": 31049088, "step": 30855 }, { "epoch": 14.54974068835455, "grad_norm": 0.03928998485207558, "learning_rate": 1.0485022164043132e-05, "loss": 0.0844, "num_input_tokens_seen": 31053952, "step": 30860 }, { "epoch": 14.552098066949553, "grad_norm": 0.06897688657045364, "learning_rate": 1.047664863514195e-05, "loss": 0.0245, "num_input_tokens_seen": 31059360, "step": 30865 }, { "epoch": 14.554455445544555, "grad_norm": 0.20744629204273224, "learning_rate": 1.0468277564797563e-05, "loss": 0.0132, "num_input_tokens_seen": 31063552, "step": 30870 }, { "epoch": 14.556812824139557, "grad_norm": 0.0407303087413311, "learning_rate": 1.045990895442705e-05, "loss": 0.0338, "num_input_tokens_seen": 31067840, "step": 30875 }, { "epoch": 14.55917020273456, "grad_norm": 0.16026845574378967, "learning_rate": 1.0451542805447073e-05, "loss": 0.1075, "num_input_tokens_seen": 31072864, "step": 30880 }, { "epoch": 14.561527581329562, "grad_norm": 1.7249526977539062, "learning_rate": 1.044317911927389e-05, "loss": 0.0966, "num_input_tokens_seen": 31076992, "step": 30885 }, { "epoch": 14.563884959924565, "grad_norm": 0.2776367664337158, "learning_rate": 1.0434817897323315e-05, "loss": 0.0992, "num_input_tokens_seen": 31081504, "step": 30890 }, { "epoch": 14.566242338519567, "grad_norm": 0.569385290145874, "learning_rate": 1.0426459141010764e-05, "loss": 0.0567, "num_input_tokens_seen": 31086048, "step": 30895 }, { "epoch": 14.56859971711457, "grad_norm": 2.2956702709198, "learning_rate": 1.0418102851751237e-05, "loss": 0.2218, "num_input_tokens_seen": 31090688, "step": 30900 }, { "epoch": 14.570957095709572, "grad_norm": 0.14709143340587616, "learning_rate": 1.0409749030959309e-05, "loss": 0.0554, "num_input_tokens_seen": 31095680, "step": 30905 }, { "epoch": 14.573314474304574, "grad_norm": 1.1100616455078125, "learning_rate": 1.0401397680049143e-05, "loss": 0.0912, "num_input_tokens_seen": 31100096, "step": 30910 }, { "epoch": 14.575671852899575, "grad_norm": 0.9952961802482605, "learning_rate": 1.0393048800434482e-05, "loss": 0.3244, "num_input_tokens_seen": 31104928, "step": 30915 }, { "epoch": 14.578029231494579, "grad_norm": 0.2003326565027237, "learning_rate": 1.0384702393528646e-05, "loss": 0.0453, "num_input_tokens_seen": 31110176, "step": 30920 }, { "epoch": 14.58038661008958, "grad_norm": 1.9229178428649902, "learning_rate": 1.0376358460744549e-05, "loss": 0.1746, "num_input_tokens_seen": 31115456, "step": 30925 }, { "epoch": 14.582743988684582, "grad_norm": 0.28347501158714294, "learning_rate": 1.036801700349466e-05, "loss": 0.0317, "num_input_tokens_seen": 31120320, "step": 30930 }, { "epoch": 14.585101367279584, "grad_norm": 0.7208079695701599, "learning_rate": 1.0359678023191056e-05, "loss": 0.1747, "num_input_tokens_seen": 31124544, "step": 30935 }, { "epoch": 14.587458745874587, "grad_norm": 0.8594056963920593, "learning_rate": 1.0351341521245384e-05, "loss": 0.118, "num_input_tokens_seen": 31129024, "step": 30940 }, { "epoch": 14.58981612446959, "grad_norm": 2.291846752166748, "learning_rate": 1.0343007499068872e-05, "loss": 0.113, "num_input_tokens_seen": 31135104, "step": 30945 }, { "epoch": 14.592173503064592, "grad_norm": 0.07477030903100967, "learning_rate": 1.0334675958072329e-05, "loss": 0.0813, "num_input_tokens_seen": 31141312, "step": 30950 }, { "epoch": 14.594530881659594, "grad_norm": 0.126731738448143, "learning_rate": 1.0326346899666142e-05, "loss": 0.1365, "num_input_tokens_seen": 31147296, "step": 30955 }, { "epoch": 14.596888260254596, "grad_norm": 0.13605792820453644, "learning_rate": 1.0318020325260282e-05, "loss": 0.0455, "num_input_tokens_seen": 31152768, "step": 30960 }, { "epoch": 14.599245638849599, "grad_norm": 1.593845009803772, "learning_rate": 1.0309696236264304e-05, "loss": 0.0567, "num_input_tokens_seen": 31157824, "step": 30965 }, { "epoch": 14.601603017444601, "grad_norm": 0.3157898783683777, "learning_rate": 1.030137463408732e-05, "loss": 0.1849, "num_input_tokens_seen": 31164224, "step": 30970 }, { "epoch": 14.603960396039604, "grad_norm": 0.05093662440776825, "learning_rate": 1.0293055520138037e-05, "loss": 0.0566, "num_input_tokens_seen": 31169056, "step": 30975 }, { "epoch": 14.606317774634606, "grad_norm": 2.4747307300567627, "learning_rate": 1.0284738895824748e-05, "loss": 0.1621, "num_input_tokens_seen": 31174976, "step": 30980 }, { "epoch": 14.608675153229608, "grad_norm": 0.05469079315662384, "learning_rate": 1.0276424762555308e-05, "loss": 0.0617, "num_input_tokens_seen": 31179744, "step": 30985 }, { "epoch": 14.61103253182461, "grad_norm": 0.07992889732122421, "learning_rate": 1.0268113121737169e-05, "loss": 0.0863, "num_input_tokens_seen": 31184416, "step": 30990 }, { "epoch": 14.613389910419613, "grad_norm": 0.4336244463920593, "learning_rate": 1.0259803974777343e-05, "loss": 0.0857, "num_input_tokens_seen": 31189216, "step": 30995 }, { "epoch": 14.615747289014616, "grad_norm": 0.01821434497833252, "learning_rate": 1.0251497323082432e-05, "loss": 0.0707, "num_input_tokens_seen": 31193216, "step": 31000 }, { "epoch": 14.618104667609618, "grad_norm": 0.050286464393138885, "learning_rate": 1.0243193168058616e-05, "loss": 0.0196, "num_input_tokens_seen": 31197600, "step": 31005 }, { "epoch": 14.62046204620462, "grad_norm": 0.16387593746185303, "learning_rate": 1.0234891511111633e-05, "loss": 0.1148, "num_input_tokens_seen": 31202752, "step": 31010 }, { "epoch": 14.622819424799623, "grad_norm": 0.6990196108818054, "learning_rate": 1.0226592353646828e-05, "loss": 0.0634, "num_input_tokens_seen": 31208256, "step": 31015 }, { "epoch": 14.625176803394625, "grad_norm": 0.8008877635002136, "learning_rate": 1.0218295697069089e-05, "loss": 0.0526, "num_input_tokens_seen": 31212416, "step": 31020 }, { "epoch": 14.627534181989628, "grad_norm": 3.0835120677948, "learning_rate": 1.021000154278291e-05, "loss": 0.1146, "num_input_tokens_seen": 31218528, "step": 31025 }, { "epoch": 14.62989156058463, "grad_norm": 0.8151105642318726, "learning_rate": 1.0201709892192352e-05, "loss": 0.0807, "num_input_tokens_seen": 31222208, "step": 31030 }, { "epoch": 14.632248939179632, "grad_norm": 0.054315708577632904, "learning_rate": 1.0193420746701047e-05, "loss": 0.043, "num_input_tokens_seen": 31226560, "step": 31035 }, { "epoch": 14.634606317774635, "grad_norm": 0.2779933214187622, "learning_rate": 1.0185134107712208e-05, "loss": 0.0279, "num_input_tokens_seen": 31231264, "step": 31040 }, { "epoch": 14.636963696369637, "grad_norm": 1.479953408241272, "learning_rate": 1.0176849976628622e-05, "loss": 0.1611, "num_input_tokens_seen": 31235648, "step": 31045 }, { "epoch": 14.63932107496464, "grad_norm": 0.4118720293045044, "learning_rate": 1.0168568354852651e-05, "loss": 0.0595, "num_input_tokens_seen": 31241344, "step": 31050 }, { "epoch": 14.641678453559642, "grad_norm": 0.5987028479576111, "learning_rate": 1.0160289243786245e-05, "loss": 0.0491, "num_input_tokens_seen": 31245376, "step": 31055 }, { "epoch": 14.644035832154644, "grad_norm": 0.8146695494651794, "learning_rate": 1.0152012644830897e-05, "loss": 0.2576, "num_input_tokens_seen": 31251520, "step": 31060 }, { "epoch": 14.646393210749647, "grad_norm": 0.10274015367031097, "learning_rate": 1.0143738559387702e-05, "loss": 0.0908, "num_input_tokens_seen": 31257824, "step": 31065 }, { "epoch": 14.64875058934465, "grad_norm": 0.018830452114343643, "learning_rate": 1.0135466988857329e-05, "loss": 0.2127, "num_input_tokens_seen": 31262016, "step": 31070 }, { "epoch": 14.651107967939652, "grad_norm": 0.09339199960231781, "learning_rate": 1.0127197934640003e-05, "loss": 0.2105, "num_input_tokens_seen": 31266592, "step": 31075 }, { "epoch": 14.653465346534654, "grad_norm": 0.18639099597930908, "learning_rate": 1.0118931398135545e-05, "loss": 0.1024, "num_input_tokens_seen": 31272096, "step": 31080 }, { "epoch": 14.655822725129656, "grad_norm": 0.020605146884918213, "learning_rate": 1.0110667380743335e-05, "loss": 0.0156, "num_input_tokens_seen": 31276768, "step": 31085 }, { "epoch": 14.658180103724659, "grad_norm": 0.13475696742534637, "learning_rate": 1.0102405883862328e-05, "loss": 0.0339, "num_input_tokens_seen": 31281696, "step": 31090 }, { "epoch": 14.660537482319661, "grad_norm": 0.13873641192913055, "learning_rate": 1.009414690889107e-05, "loss": 0.0897, "num_input_tokens_seen": 31287232, "step": 31095 }, { "epoch": 14.662894860914664, "grad_norm": 1.2497766017913818, "learning_rate": 1.0085890457227645e-05, "loss": 0.0706, "num_input_tokens_seen": 31292192, "step": 31100 }, { "epoch": 14.665252239509666, "grad_norm": 0.3127673268318176, "learning_rate": 1.0077636530269735e-05, "loss": 0.0644, "num_input_tokens_seen": 31298208, "step": 31105 }, { "epoch": 14.667609618104667, "grad_norm": 0.8264480829238892, "learning_rate": 1.0069385129414596e-05, "loss": 0.0932, "num_input_tokens_seen": 31303616, "step": 31110 }, { "epoch": 14.66996699669967, "grad_norm": 0.16557979583740234, "learning_rate": 1.0061136256059045e-05, "loss": 0.1217, "num_input_tokens_seen": 31310368, "step": 31115 }, { "epoch": 14.672324375294671, "grad_norm": 0.6569893956184387, "learning_rate": 1.005288991159948e-05, "loss": 0.0648, "num_input_tokens_seen": 31314816, "step": 31120 }, { "epoch": 14.674681753889674, "grad_norm": 0.13446782529354095, "learning_rate": 1.0044646097431865e-05, "loss": 0.1038, "num_input_tokens_seen": 31319776, "step": 31125 }, { "epoch": 14.677039132484676, "grad_norm": 1.9022736549377441, "learning_rate": 1.0036404814951739e-05, "loss": 0.1547, "num_input_tokens_seen": 31325824, "step": 31130 }, { "epoch": 14.679396511079679, "grad_norm": 1.2154263257980347, "learning_rate": 1.0028166065554218e-05, "loss": 0.1233, "num_input_tokens_seen": 31330688, "step": 31135 }, { "epoch": 14.681753889674681, "grad_norm": 0.13670511543750763, "learning_rate": 1.0019929850633967e-05, "loss": 0.0848, "num_input_tokens_seen": 31335136, "step": 31140 }, { "epoch": 14.684111268269683, "grad_norm": 0.19287028908729553, "learning_rate": 1.0011696171585245e-05, "loss": 0.1155, "num_input_tokens_seen": 31340352, "step": 31145 }, { "epoch": 14.686468646864686, "grad_norm": 2.378803253173828, "learning_rate": 1.0003465029801873e-05, "loss": 0.1926, "num_input_tokens_seen": 31344896, "step": 31150 }, { "epoch": 14.688826025459688, "grad_norm": 0.0710732564330101, "learning_rate": 9.995236426677245e-06, "loss": 0.212, "num_input_tokens_seen": 31349984, "step": 31155 }, { "epoch": 14.69118340405469, "grad_norm": 0.058476515114307404, "learning_rate": 9.987010363604327e-06, "loss": 0.2056, "num_input_tokens_seen": 31354592, "step": 31160 }, { "epoch": 14.693540782649693, "grad_norm": 1.0257091522216797, "learning_rate": 9.978786841975646e-06, "loss": 0.173, "num_input_tokens_seen": 31359072, "step": 31165 }, { "epoch": 14.695898161244696, "grad_norm": 0.03631589561700821, "learning_rate": 9.970565863183306e-06, "loss": 0.08, "num_input_tokens_seen": 31365216, "step": 31170 }, { "epoch": 14.698255539839698, "grad_norm": 0.07287703454494476, "learning_rate": 9.96234742861899e-06, "loss": 0.1001, "num_input_tokens_seen": 31370144, "step": 31175 }, { "epoch": 14.7006129184347, "grad_norm": 1.6768701076507568, "learning_rate": 9.95413153967392e-06, "loss": 0.0769, "num_input_tokens_seen": 31374400, "step": 31180 }, { "epoch": 14.702970297029703, "grad_norm": 1.5474108457565308, "learning_rate": 9.945918197738915e-06, "loss": 0.2135, "num_input_tokens_seen": 31378688, "step": 31185 }, { "epoch": 14.705327675624705, "grad_norm": 0.22745084762573242, "learning_rate": 9.937707404204358e-06, "loss": 0.0274, "num_input_tokens_seen": 31383424, "step": 31190 }, { "epoch": 14.707685054219708, "grad_norm": 0.26247289776802063, "learning_rate": 9.929499160460193e-06, "loss": 0.0838, "num_input_tokens_seen": 31387840, "step": 31195 }, { "epoch": 14.71004243281471, "grad_norm": 0.19461356103420258, "learning_rate": 9.921293467895937e-06, "loss": 0.1856, "num_input_tokens_seen": 31392000, "step": 31200 }, { "epoch": 14.712399811409712, "grad_norm": 2.059631109237671, "learning_rate": 9.913090327900677e-06, "loss": 0.0624, "num_input_tokens_seen": 31396224, "step": 31205 }, { "epoch": 14.714757190004715, "grad_norm": 0.04731063172221184, "learning_rate": 9.90488974186306e-06, "loss": 0.0389, "num_input_tokens_seen": 31401472, "step": 31210 }, { "epoch": 14.717114568599717, "grad_norm": 0.7537217736244202, "learning_rate": 9.89669171117132e-06, "loss": 0.0284, "num_input_tokens_seen": 31405696, "step": 31215 }, { "epoch": 14.71947194719472, "grad_norm": 1.000540852546692, "learning_rate": 9.888496237213226e-06, "loss": 0.0927, "num_input_tokens_seen": 31410400, "step": 31220 }, { "epoch": 14.721829325789722, "grad_norm": 0.5202052593231201, "learning_rate": 9.880303321376139e-06, "loss": 0.0771, "num_input_tokens_seen": 31415040, "step": 31225 }, { "epoch": 14.724186704384724, "grad_norm": 1.2918763160705566, "learning_rate": 9.872112965046984e-06, "loss": 0.075, "num_input_tokens_seen": 31419776, "step": 31230 }, { "epoch": 14.726544082979727, "grad_norm": 0.7402909994125366, "learning_rate": 9.863925169612253e-06, "loss": 0.1926, "num_input_tokens_seen": 31427008, "step": 31235 }, { "epoch": 14.72890146157473, "grad_norm": 1.4395480155944824, "learning_rate": 9.855739936457986e-06, "loss": 0.0636, "num_input_tokens_seen": 31432064, "step": 31240 }, { "epoch": 14.731258840169732, "grad_norm": 1.435276746749878, "learning_rate": 9.847557266969814e-06, "loss": 0.1413, "num_input_tokens_seen": 31436160, "step": 31245 }, { "epoch": 14.733616218764734, "grad_norm": 0.03643229976296425, "learning_rate": 9.839377162532924e-06, "loss": 0.0398, "num_input_tokens_seen": 31440000, "step": 31250 }, { "epoch": 14.735973597359736, "grad_norm": 0.3409044146537781, "learning_rate": 9.831199624532067e-06, "loss": 0.1318, "num_input_tokens_seen": 31445952, "step": 31255 }, { "epoch": 14.738330975954739, "grad_norm": 2.924180030822754, "learning_rate": 9.823024654351562e-06, "loss": 0.0771, "num_input_tokens_seen": 31450336, "step": 31260 }, { "epoch": 14.740688354549741, "grad_norm": 1.0838422775268555, "learning_rate": 9.8148522533753e-06, "loss": 0.0593, "num_input_tokens_seen": 31456608, "step": 31265 }, { "epoch": 14.743045733144744, "grad_norm": 0.03518535941839218, "learning_rate": 9.806682422986715e-06, "loss": 0.0599, "num_input_tokens_seen": 31460864, "step": 31270 }, { "epoch": 14.745403111739746, "grad_norm": 0.2754179835319519, "learning_rate": 9.798515164568827e-06, "loss": 0.1384, "num_input_tokens_seen": 31466304, "step": 31275 }, { "epoch": 14.747760490334748, "grad_norm": 0.20062941312789917, "learning_rate": 9.790350479504215e-06, "loss": 0.1223, "num_input_tokens_seen": 31471072, "step": 31280 }, { "epoch": 14.75011786892975, "grad_norm": 1.7613650560379028, "learning_rate": 9.782188369175024e-06, "loss": 0.1419, "num_input_tokens_seen": 31475360, "step": 31285 }, { "epoch": 14.752475247524753, "grad_norm": 0.9932823181152344, "learning_rate": 9.774028834962956e-06, "loss": 0.0343, "num_input_tokens_seen": 31480896, "step": 31290 }, { "epoch": 14.754832626119756, "grad_norm": 0.4214668571949005, "learning_rate": 9.765871878249286e-06, "loss": 0.1234, "num_input_tokens_seen": 31486048, "step": 31295 }, { "epoch": 14.757190004714758, "grad_norm": 0.2061941921710968, "learning_rate": 9.757717500414842e-06, "loss": 0.0293, "num_input_tokens_seen": 31490752, "step": 31300 }, { "epoch": 14.75954738330976, "grad_norm": 0.46446216106414795, "learning_rate": 9.749565702840038e-06, "loss": 0.1201, "num_input_tokens_seen": 31494848, "step": 31305 }, { "epoch": 14.761904761904763, "grad_norm": 0.6329752206802368, "learning_rate": 9.741416486904814e-06, "loss": 0.1159, "num_input_tokens_seen": 31500256, "step": 31310 }, { "epoch": 14.764262140499763, "grad_norm": 0.7489258050918579, "learning_rate": 9.733269853988703e-06, "loss": 0.0852, "num_input_tokens_seen": 31505344, "step": 31315 }, { "epoch": 14.766619519094768, "grad_norm": 0.14510075747966766, "learning_rate": 9.725125805470792e-06, "loss": 0.1946, "num_input_tokens_seen": 31512608, "step": 31320 }, { "epoch": 14.768976897689768, "grad_norm": 1.55421781539917, "learning_rate": 9.716984342729727e-06, "loss": 0.2275, "num_input_tokens_seen": 31519200, "step": 31325 }, { "epoch": 14.77133427628477, "grad_norm": 0.1438112109899521, "learning_rate": 9.708845467143723e-06, "loss": 0.1303, "num_input_tokens_seen": 31523712, "step": 31330 }, { "epoch": 14.773691654879773, "grad_norm": 0.4619964361190796, "learning_rate": 9.700709180090553e-06, "loss": 0.0204, "num_input_tokens_seen": 31528416, "step": 31335 }, { "epoch": 14.776049033474775, "grad_norm": 1.1127433776855469, "learning_rate": 9.692575482947552e-06, "loss": 0.1504, "num_input_tokens_seen": 31534208, "step": 31340 }, { "epoch": 14.778406412069778, "grad_norm": 1.1802042722702026, "learning_rate": 9.684444377091625e-06, "loss": 0.1112, "num_input_tokens_seen": 31539744, "step": 31345 }, { "epoch": 14.78076379066478, "grad_norm": 1.3853262662887573, "learning_rate": 9.676315863899213e-06, "loss": 0.115, "num_input_tokens_seen": 31545376, "step": 31350 }, { "epoch": 14.783121169259783, "grad_norm": 1.4598944187164307, "learning_rate": 9.668189944746343e-06, "loss": 0.1022, "num_input_tokens_seen": 31551424, "step": 31355 }, { "epoch": 14.785478547854785, "grad_norm": 1.5974806547164917, "learning_rate": 9.660066621008595e-06, "loss": 0.3049, "num_input_tokens_seen": 31555840, "step": 31360 }, { "epoch": 14.787835926449787, "grad_norm": 1.5884798765182495, "learning_rate": 9.651945894061112e-06, "loss": 0.1042, "num_input_tokens_seen": 31560416, "step": 31365 }, { "epoch": 14.79019330504479, "grad_norm": 2.9446022510528564, "learning_rate": 9.643827765278596e-06, "loss": 0.1535, "num_input_tokens_seen": 31565216, "step": 31370 }, { "epoch": 14.792550683639792, "grad_norm": 1.726260781288147, "learning_rate": 9.635712236035305e-06, "loss": 0.1064, "num_input_tokens_seen": 31569888, "step": 31375 }, { "epoch": 14.794908062234795, "grad_norm": 0.3539672791957855, "learning_rate": 9.62759930770506e-06, "loss": 0.1124, "num_input_tokens_seen": 31574656, "step": 31380 }, { "epoch": 14.797265440829797, "grad_norm": 0.19480328261852264, "learning_rate": 9.619488981661254e-06, "loss": 0.079, "num_input_tokens_seen": 31579552, "step": 31385 }, { "epoch": 14.7996228194248, "grad_norm": 0.055160123854875565, "learning_rate": 9.611381259276808e-06, "loss": 0.0346, "num_input_tokens_seen": 31584960, "step": 31390 }, { "epoch": 14.801980198019802, "grad_norm": 1.9950356483459473, "learning_rate": 9.60327614192423e-06, "loss": 0.2221, "num_input_tokens_seen": 31589728, "step": 31395 }, { "epoch": 14.804337576614804, "grad_norm": 2.285965919494629, "learning_rate": 9.595173630975582e-06, "loss": 0.1251, "num_input_tokens_seen": 31594080, "step": 31400 }, { "epoch": 14.806694955209807, "grad_norm": 0.2078143060207367, "learning_rate": 9.587073727802478e-06, "loss": 0.0282, "num_input_tokens_seen": 31598656, "step": 31405 }, { "epoch": 14.809052333804809, "grad_norm": 0.2519935071468353, "learning_rate": 9.578976433776096e-06, "loss": 0.0943, "num_input_tokens_seen": 31603200, "step": 31410 }, { "epoch": 14.811409712399811, "grad_norm": 0.20222234725952148, "learning_rate": 9.57088175026717e-06, "loss": 0.0809, "num_input_tokens_seen": 31608704, "step": 31415 }, { "epoch": 14.813767090994814, "grad_norm": 2.9358935356140137, "learning_rate": 9.56278967864599e-06, "loss": 0.1564, "num_input_tokens_seen": 31614016, "step": 31420 }, { "epoch": 14.816124469589816, "grad_norm": 0.37047767639160156, "learning_rate": 9.554700220282417e-06, "loss": 0.0122, "num_input_tokens_seen": 31618656, "step": 31425 }, { "epoch": 14.818481848184819, "grad_norm": 0.6925981640815735, "learning_rate": 9.546613376545844e-06, "loss": 0.0932, "num_input_tokens_seen": 31623872, "step": 31430 }, { "epoch": 14.820839226779821, "grad_norm": 0.5572097301483154, "learning_rate": 9.538529148805242e-06, "loss": 0.0923, "num_input_tokens_seen": 31629184, "step": 31435 }, { "epoch": 14.823196605374823, "grad_norm": 2.544020652770996, "learning_rate": 9.530447538429135e-06, "loss": 0.1809, "num_input_tokens_seen": 31633376, "step": 31440 }, { "epoch": 14.825553983969826, "grad_norm": 1.0757085084915161, "learning_rate": 9.522368546785602e-06, "loss": 0.121, "num_input_tokens_seen": 31637632, "step": 31445 }, { "epoch": 14.827911362564828, "grad_norm": 0.1910233050584793, "learning_rate": 9.514292175242277e-06, "loss": 0.0249, "num_input_tokens_seen": 31642304, "step": 31450 }, { "epoch": 14.83026874115983, "grad_norm": 1.605124592781067, "learning_rate": 9.506218425166355e-06, "loss": 0.2357, "num_input_tokens_seen": 31646720, "step": 31455 }, { "epoch": 14.832626119754833, "grad_norm": 0.28680476546287537, "learning_rate": 9.498147297924593e-06, "loss": 0.0863, "num_input_tokens_seen": 31651200, "step": 31460 }, { "epoch": 14.834983498349835, "grad_norm": 0.6002310514450073, "learning_rate": 9.490078794883277e-06, "loss": 0.1222, "num_input_tokens_seen": 31655776, "step": 31465 }, { "epoch": 14.837340876944838, "grad_norm": 1.6502214670181274, "learning_rate": 9.482012917408286e-06, "loss": 0.0854, "num_input_tokens_seen": 31661408, "step": 31470 }, { "epoch": 14.83969825553984, "grad_norm": 1.6343587636947632, "learning_rate": 9.473949666865023e-06, "loss": 0.2087, "num_input_tokens_seen": 31667584, "step": 31475 }, { "epoch": 14.842055634134843, "grad_norm": 0.06391452997922897, "learning_rate": 9.465889044618462e-06, "loss": 0.07, "num_input_tokens_seen": 31671904, "step": 31480 }, { "epoch": 14.844413012729845, "grad_norm": 0.29891499876976013, "learning_rate": 9.457831052033132e-06, "loss": 0.0571, "num_input_tokens_seen": 31677152, "step": 31485 }, { "epoch": 14.846770391324847, "grad_norm": 0.027868080884218216, "learning_rate": 9.449775690473112e-06, "loss": 0.0404, "num_input_tokens_seen": 31682720, "step": 31490 }, { "epoch": 14.84912776991985, "grad_norm": 0.9880334734916687, "learning_rate": 9.441722961302039e-06, "loss": 0.133, "num_input_tokens_seen": 31687392, "step": 31495 }, { "epoch": 14.851485148514852, "grad_norm": 0.2070612758398056, "learning_rate": 9.433672865883106e-06, "loss": 0.0593, "num_input_tokens_seen": 31691904, "step": 31500 }, { "epoch": 14.853842527109855, "grad_norm": 3.036787509918213, "learning_rate": 9.425625405579055e-06, "loss": 0.1888, "num_input_tokens_seen": 31696512, "step": 31505 }, { "epoch": 14.856199905704855, "grad_norm": 0.24962051212787628, "learning_rate": 9.417580581752194e-06, "loss": 0.0196, "num_input_tokens_seen": 31703232, "step": 31510 }, { "epoch": 14.85855728429986, "grad_norm": 0.09652592986822128, "learning_rate": 9.409538395764358e-06, "loss": 0.2044, "num_input_tokens_seen": 31708000, "step": 31515 }, { "epoch": 14.86091466289486, "grad_norm": 0.019023224711418152, "learning_rate": 9.401498848976962e-06, "loss": 0.1011, "num_input_tokens_seen": 31713120, "step": 31520 }, { "epoch": 14.863272041489862, "grad_norm": 1.2705659866333008, "learning_rate": 9.393461942750961e-06, "loss": 0.2202, "num_input_tokens_seen": 31718528, "step": 31525 }, { "epoch": 14.865629420084865, "grad_norm": 0.06200871616601944, "learning_rate": 9.38542767844687e-06, "loss": 0.1559, "num_input_tokens_seen": 31723488, "step": 31530 }, { "epoch": 14.867986798679867, "grad_norm": 1.683730125427246, "learning_rate": 9.377396057424751e-06, "loss": 0.1519, "num_input_tokens_seen": 31729536, "step": 31535 }, { "epoch": 14.87034417727487, "grad_norm": 0.23173542320728302, "learning_rate": 9.369367081044226e-06, "loss": 0.1879, "num_input_tokens_seen": 31733984, "step": 31540 }, { "epoch": 14.872701555869872, "grad_norm": 0.8855547308921814, "learning_rate": 9.361340750664458e-06, "loss": 0.0318, "num_input_tokens_seen": 31739840, "step": 31545 }, { "epoch": 14.875058934464874, "grad_norm": 0.2134646475315094, "learning_rate": 9.353317067644182e-06, "loss": 0.0761, "num_input_tokens_seen": 31745024, "step": 31550 }, { "epoch": 14.877416313059877, "grad_norm": 1.574973464012146, "learning_rate": 9.345296033341652e-06, "loss": 0.23, "num_input_tokens_seen": 31749536, "step": 31555 }, { "epoch": 14.87977369165488, "grad_norm": 0.6603807806968689, "learning_rate": 9.3372776491147e-06, "loss": 0.0844, "num_input_tokens_seen": 31754528, "step": 31560 }, { "epoch": 14.882131070249882, "grad_norm": 0.1487075388431549, "learning_rate": 9.329261916320709e-06, "loss": 0.1292, "num_input_tokens_seen": 31759168, "step": 31565 }, { "epoch": 14.884488448844884, "grad_norm": 0.050144899636507034, "learning_rate": 9.321248836316596e-06, "loss": 0.0143, "num_input_tokens_seen": 31763200, "step": 31570 }, { "epoch": 14.886845827439886, "grad_norm": 0.24797090888023376, "learning_rate": 9.313238410458849e-06, "loss": 0.068, "num_input_tokens_seen": 31768192, "step": 31575 }, { "epoch": 14.889203206034889, "grad_norm": 0.4417089521884918, "learning_rate": 9.30523064010349e-06, "loss": 0.0436, "num_input_tokens_seen": 31773344, "step": 31580 }, { "epoch": 14.891560584629891, "grad_norm": 0.5716370940208435, "learning_rate": 9.297225526606104e-06, "loss": 0.1456, "num_input_tokens_seen": 31778240, "step": 31585 }, { "epoch": 14.893917963224894, "grad_norm": 0.04327535256743431, "learning_rate": 9.289223071321826e-06, "loss": 0.2107, "num_input_tokens_seen": 31783552, "step": 31590 }, { "epoch": 14.896275341819896, "grad_norm": 0.05716390162706375, "learning_rate": 9.281223275605319e-06, "loss": 0.0949, "num_input_tokens_seen": 31790080, "step": 31595 }, { "epoch": 14.898632720414899, "grad_norm": 0.4160711467266083, "learning_rate": 9.273226140810822e-06, "loss": 0.0486, "num_input_tokens_seen": 31795296, "step": 31600 }, { "epoch": 14.900990099009901, "grad_norm": 0.5239654183387756, "learning_rate": 9.265231668292114e-06, "loss": 0.0907, "num_input_tokens_seen": 31799680, "step": 31605 }, { "epoch": 14.903347477604903, "grad_norm": 0.6329130530357361, "learning_rate": 9.257239859402525e-06, "loss": 0.1464, "num_input_tokens_seen": 31804544, "step": 31610 }, { "epoch": 14.905704856199906, "grad_norm": 1.8567019701004028, "learning_rate": 9.24925071549493e-06, "loss": 0.1831, "num_input_tokens_seen": 31809056, "step": 31615 }, { "epoch": 14.908062234794908, "grad_norm": 0.5095946788787842, "learning_rate": 9.241264237921757e-06, "loss": 0.0597, "num_input_tokens_seen": 31813440, "step": 31620 }, { "epoch": 14.91041961338991, "grad_norm": 1.6740025281906128, "learning_rate": 9.23328042803498e-06, "loss": 0.3281, "num_input_tokens_seen": 31818848, "step": 31625 }, { "epoch": 14.912776991984913, "grad_norm": 0.3052058815956116, "learning_rate": 9.22529928718613e-06, "loss": 0.1235, "num_input_tokens_seen": 31824032, "step": 31630 }, { "epoch": 14.915134370579915, "grad_norm": 1.1108684539794922, "learning_rate": 9.217320816726263e-06, "loss": 0.1096, "num_input_tokens_seen": 31828832, "step": 31635 }, { "epoch": 14.917491749174918, "grad_norm": 1.1433227062225342, "learning_rate": 9.209345018006008e-06, "loss": 0.0977, "num_input_tokens_seen": 31834112, "step": 31640 }, { "epoch": 14.91984912776992, "grad_norm": 0.7520899772644043, "learning_rate": 9.201371892375533e-06, "loss": 0.0794, "num_input_tokens_seen": 31839872, "step": 31645 }, { "epoch": 14.922206506364923, "grad_norm": 1.2246290445327759, "learning_rate": 9.19340144118455e-06, "loss": 0.223, "num_input_tokens_seen": 31845152, "step": 31650 }, { "epoch": 14.924563884959925, "grad_norm": 1.0135993957519531, "learning_rate": 9.18543366578232e-06, "loss": 0.0653, "num_input_tokens_seen": 31849376, "step": 31655 }, { "epoch": 14.926921263554927, "grad_norm": 2.197679042816162, "learning_rate": 9.177468567517656e-06, "loss": 0.1783, "num_input_tokens_seen": 31854464, "step": 31660 }, { "epoch": 14.92927864214993, "grad_norm": 0.15537706017494202, "learning_rate": 9.169506147738913e-06, "loss": 0.0803, "num_input_tokens_seen": 31858976, "step": 31665 }, { "epoch": 14.931636020744932, "grad_norm": 1.3552597761154175, "learning_rate": 9.161546407794e-06, "loss": 0.1634, "num_input_tokens_seen": 31863616, "step": 31670 }, { "epoch": 14.933993399339935, "grad_norm": 2.461122989654541, "learning_rate": 9.153589349030351e-06, "loss": 0.2116, "num_input_tokens_seen": 31868288, "step": 31675 }, { "epoch": 14.936350777934937, "grad_norm": 0.5417162775993347, "learning_rate": 9.14563497279497e-06, "loss": 0.0605, "num_input_tokens_seen": 31874016, "step": 31680 }, { "epoch": 14.93870815652994, "grad_norm": 0.6104033589363098, "learning_rate": 9.1376832804344e-06, "loss": 0.1119, "num_input_tokens_seen": 31879072, "step": 31685 }, { "epoch": 14.941065535124942, "grad_norm": 0.016406817361712456, "learning_rate": 9.129734273294718e-06, "loss": 0.0295, "num_input_tokens_seen": 31884576, "step": 31690 }, { "epoch": 14.943422913719944, "grad_norm": 0.713245689868927, "learning_rate": 9.121787952721564e-06, "loss": 0.067, "num_input_tokens_seen": 31890816, "step": 31695 }, { "epoch": 14.945780292314947, "grad_norm": 0.19671551883220673, "learning_rate": 9.113844320060111e-06, "loss": 0.1276, "num_input_tokens_seen": 31895488, "step": 31700 }, { "epoch": 14.948137670909949, "grad_norm": 0.9070272445678711, "learning_rate": 9.10590337665508e-06, "loss": 0.0904, "num_input_tokens_seen": 31901408, "step": 31705 }, { "epoch": 14.950495049504951, "grad_norm": 1.890116572380066, "learning_rate": 9.097965123850746e-06, "loss": 0.0873, "num_input_tokens_seen": 31906336, "step": 31710 }, { "epoch": 14.952852428099952, "grad_norm": 0.04097862169146538, "learning_rate": 9.090029562990911e-06, "loss": 0.0766, "num_input_tokens_seen": 31910560, "step": 31715 }, { "epoch": 14.955209806694956, "grad_norm": 0.3093239665031433, "learning_rate": 9.082096695418949e-06, "loss": 0.0709, "num_input_tokens_seen": 31915200, "step": 31720 }, { "epoch": 14.957567185289957, "grad_norm": 0.10829099267721176, "learning_rate": 9.074166522477734e-06, "loss": 0.1328, "num_input_tokens_seen": 31920960, "step": 31725 }, { "epoch": 14.95992456388496, "grad_norm": 1.1033517122268677, "learning_rate": 9.066239045509723e-06, "loss": 0.164, "num_input_tokens_seen": 31926080, "step": 31730 }, { "epoch": 14.962281942479962, "grad_norm": 0.14185422658920288, "learning_rate": 9.058314265856899e-06, "loss": 0.11, "num_input_tokens_seen": 31932160, "step": 31735 }, { "epoch": 14.964639321074964, "grad_norm": 1.067014217376709, "learning_rate": 9.050392184860796e-06, "loss": 0.1241, "num_input_tokens_seen": 31937536, "step": 31740 }, { "epoch": 14.966996699669966, "grad_norm": 0.2085096538066864, "learning_rate": 9.042472803862489e-06, "loss": 0.0586, "num_input_tokens_seen": 31942976, "step": 31745 }, { "epoch": 14.969354078264969, "grad_norm": 3.1386985778808594, "learning_rate": 9.034556124202592e-06, "loss": 0.3403, "num_input_tokens_seen": 31947584, "step": 31750 }, { "epoch": 14.971711456859971, "grad_norm": 0.1665632426738739, "learning_rate": 9.026642147221264e-06, "loss": 0.0467, "num_input_tokens_seen": 31952096, "step": 31755 }, { "epoch": 14.974068835454974, "grad_norm": 0.41649362444877625, "learning_rate": 9.018730874258219e-06, "loss": 0.2326, "num_input_tokens_seen": 31957504, "step": 31760 }, { "epoch": 14.976426214049976, "grad_norm": 0.5664517879486084, "learning_rate": 9.01082230665268e-06, "loss": 0.1468, "num_input_tokens_seen": 31962112, "step": 31765 }, { "epoch": 14.978783592644978, "grad_norm": 1.038396954536438, "learning_rate": 9.002916445743445e-06, "loss": 0.0775, "num_input_tokens_seen": 31966880, "step": 31770 }, { "epoch": 14.98114097123998, "grad_norm": 1.830810546875, "learning_rate": 8.99501329286884e-06, "loss": 0.1818, "num_input_tokens_seen": 31971488, "step": 31775 }, { "epoch": 14.983498349834983, "grad_norm": 2.0516581535339355, "learning_rate": 8.987112849366735e-06, "loss": 0.1481, "num_input_tokens_seen": 31977152, "step": 31780 }, { "epoch": 14.985855728429986, "grad_norm": 0.04832421615719795, "learning_rate": 8.979215116574544e-06, "loss": 0.1713, "num_input_tokens_seen": 31981568, "step": 31785 }, { "epoch": 14.988213107024988, "grad_norm": 0.42999643087387085, "learning_rate": 8.971320095829214e-06, "loss": 0.1665, "num_input_tokens_seen": 31987488, "step": 31790 }, { "epoch": 14.99057048561999, "grad_norm": 2.39227294921875, "learning_rate": 8.963427788467241e-06, "loss": 0.13, "num_input_tokens_seen": 31992672, "step": 31795 }, { "epoch": 14.992927864214993, "grad_norm": 0.03053586184978485, "learning_rate": 8.955538195824664e-06, "loss": 0.0661, "num_input_tokens_seen": 31997472, "step": 31800 }, { "epoch": 14.995285242809995, "grad_norm": 0.35995715856552124, "learning_rate": 8.947651319237046e-06, "loss": 0.1253, "num_input_tokens_seen": 32003456, "step": 31805 }, { "epoch": 14.997642621404998, "grad_norm": 0.22401276230812073, "learning_rate": 8.939767160039503e-06, "loss": 0.1677, "num_input_tokens_seen": 32008512, "step": 31810 }, { "epoch": 15.0, "grad_norm": 0.20752137899398804, "learning_rate": 8.931885719566697e-06, "loss": 0.0246, "num_input_tokens_seen": 32013760, "step": 31815 }, { "epoch": 15.0, "eval_loss": 0.153660848736763, "eval_runtime": 15.1398, "eval_samples_per_second": 62.286, "eval_steps_per_second": 15.588, "num_input_tokens_seen": 32013760, "step": 31815 }, { "epoch": 15.002357378595002, "grad_norm": 0.0801275372505188, "learning_rate": 8.924006999152815e-06, "loss": 0.0704, "num_input_tokens_seen": 32018688, "step": 31820 }, { "epoch": 15.004714757190005, "grad_norm": 1.158119797706604, "learning_rate": 8.916131000131592e-06, "loss": 0.1114, "num_input_tokens_seen": 32025088, "step": 31825 }, { "epoch": 15.007072135785007, "grad_norm": 0.4533681869506836, "learning_rate": 8.908257723836303e-06, "loss": 0.014, "num_input_tokens_seen": 32029184, "step": 31830 }, { "epoch": 15.00942951438001, "grad_norm": 1.1901681423187256, "learning_rate": 8.900387171599759e-06, "loss": 0.1588, "num_input_tokens_seen": 32033920, "step": 31835 }, { "epoch": 15.011786892975012, "grad_norm": 2.1329424381256104, "learning_rate": 8.89251934475432e-06, "loss": 0.2232, "num_input_tokens_seen": 32039232, "step": 31840 }, { "epoch": 15.014144271570014, "grad_norm": 0.008526121266186237, "learning_rate": 8.884654244631857e-06, "loss": 0.0256, "num_input_tokens_seen": 32043808, "step": 31845 }, { "epoch": 15.016501650165017, "grad_norm": 1.8570871353149414, "learning_rate": 8.876791872563808e-06, "loss": 0.0874, "num_input_tokens_seen": 32048768, "step": 31850 }, { "epoch": 15.01885902876002, "grad_norm": 0.5382982492446899, "learning_rate": 8.868932229881142e-06, "loss": 0.058, "num_input_tokens_seen": 32053824, "step": 31855 }, { "epoch": 15.021216407355022, "grad_norm": 0.39118802547454834, "learning_rate": 8.861075317914356e-06, "loss": 0.1313, "num_input_tokens_seen": 32059584, "step": 31860 }, { "epoch": 15.023573785950024, "grad_norm": 1.8385231494903564, "learning_rate": 8.853221137993494e-06, "loss": 0.0827, "num_input_tokens_seen": 32063680, "step": 31865 }, { "epoch": 15.025931164545026, "grad_norm": 0.15216410160064697, "learning_rate": 8.845369691448138e-06, "loss": 0.0654, "num_input_tokens_seen": 32068416, "step": 31870 }, { "epoch": 15.028288543140029, "grad_norm": 0.3788174092769623, "learning_rate": 8.837520979607403e-06, "loss": 0.0661, "num_input_tokens_seen": 32072736, "step": 31875 }, { "epoch": 15.030645921735031, "grad_norm": 0.0598321296274662, "learning_rate": 8.829675003799954e-06, "loss": 0.0145, "num_input_tokens_seen": 32077088, "step": 31880 }, { "epoch": 15.033003300330034, "grad_norm": 1.3967313766479492, "learning_rate": 8.821831765353957e-06, "loss": 0.2106, "num_input_tokens_seen": 32083392, "step": 31885 }, { "epoch": 15.035360678925036, "grad_norm": 0.14148090779781342, "learning_rate": 8.813991265597158e-06, "loss": 0.1773, "num_input_tokens_seen": 32089312, "step": 31890 }, { "epoch": 15.037718057520038, "grad_norm": 1.9538294076919556, "learning_rate": 8.806153505856813e-06, "loss": 0.2276, "num_input_tokens_seen": 32094496, "step": 31895 }, { "epoch": 15.04007543611504, "grad_norm": 0.13010907173156738, "learning_rate": 8.79831848745972e-06, "loss": 0.0519, "num_input_tokens_seen": 32098464, "step": 31900 }, { "epoch": 15.042432814710043, "grad_norm": 1.3873134851455688, "learning_rate": 8.790486211732221e-06, "loss": 0.1716, "num_input_tokens_seen": 32103168, "step": 31905 }, { "epoch": 15.044790193305046, "grad_norm": 2.1621978282928467, "learning_rate": 8.782656680000192e-06, "loss": 0.1739, "num_input_tokens_seen": 32107360, "step": 31910 }, { "epoch": 15.047147571900048, "grad_norm": 0.027856340631842613, "learning_rate": 8.774829893589026e-06, "loss": 0.098, "num_input_tokens_seen": 32111648, "step": 31915 }, { "epoch": 15.049504950495049, "grad_norm": 1.8522694110870361, "learning_rate": 8.767005853823671e-06, "loss": 0.1423, "num_input_tokens_seen": 32115648, "step": 31920 }, { "epoch": 15.051862329090051, "grad_norm": 1.1961381435394287, "learning_rate": 8.759184562028602e-06, "loss": 0.1309, "num_input_tokens_seen": 32120672, "step": 31925 }, { "epoch": 15.054219707685053, "grad_norm": 0.5180471539497375, "learning_rate": 8.751366019527842e-06, "loss": 0.1759, "num_input_tokens_seen": 32125536, "step": 31930 }, { "epoch": 15.056577086280056, "grad_norm": 0.649623453617096, "learning_rate": 8.743550227644923e-06, "loss": 0.0711, "num_input_tokens_seen": 32131104, "step": 31935 }, { "epoch": 15.058934464875058, "grad_norm": 0.16703112423419952, "learning_rate": 8.735737187702931e-06, "loss": 0.0172, "num_input_tokens_seen": 32136256, "step": 31940 }, { "epoch": 15.06129184347006, "grad_norm": 2.3640804290771484, "learning_rate": 8.72792690102448e-06, "loss": 0.0865, "num_input_tokens_seen": 32141024, "step": 31945 }, { "epoch": 15.063649222065063, "grad_norm": 0.13328170776367188, "learning_rate": 8.720119368931722e-06, "loss": 0.0941, "num_input_tokens_seen": 32146336, "step": 31950 }, { "epoch": 15.066006600660065, "grad_norm": 1.3920457363128662, "learning_rate": 8.712314592746338e-06, "loss": 0.0613, "num_input_tokens_seen": 32151040, "step": 31955 }, { "epoch": 15.068363979255068, "grad_norm": 1.4762625694274902, "learning_rate": 8.704512573789545e-06, "loss": 0.0623, "num_input_tokens_seen": 32155328, "step": 31960 }, { "epoch": 15.07072135785007, "grad_norm": 0.5854455232620239, "learning_rate": 8.696713313382089e-06, "loss": 0.1134, "num_input_tokens_seen": 32160544, "step": 31965 }, { "epoch": 15.073078736445073, "grad_norm": 0.04418397322297096, "learning_rate": 8.688916812844264e-06, "loss": 0.1052, "num_input_tokens_seen": 32164608, "step": 31970 }, { "epoch": 15.075436115040075, "grad_norm": 2.346679449081421, "learning_rate": 8.68112307349587e-06, "loss": 0.0937, "num_input_tokens_seen": 32170208, "step": 31975 }, { "epoch": 15.077793493635077, "grad_norm": 0.2658904492855072, "learning_rate": 8.673332096656259e-06, "loss": 0.02, "num_input_tokens_seen": 32175200, "step": 31980 }, { "epoch": 15.08015087223008, "grad_norm": 0.18160031735897064, "learning_rate": 8.665543883644315e-06, "loss": 0.0405, "num_input_tokens_seen": 32179776, "step": 31985 }, { "epoch": 15.082508250825082, "grad_norm": 0.4544573724269867, "learning_rate": 8.657758435778447e-06, "loss": 0.1371, "num_input_tokens_seen": 32184640, "step": 31990 }, { "epoch": 15.084865629420085, "grad_norm": 0.623639702796936, "learning_rate": 8.6499757543766e-06, "loss": 0.0648, "num_input_tokens_seen": 32190592, "step": 31995 }, { "epoch": 15.087223008015087, "grad_norm": 2.1553375720977783, "learning_rate": 8.642195840756248e-06, "loss": 0.0802, "num_input_tokens_seen": 32195296, "step": 32000 }, { "epoch": 15.08958038661009, "grad_norm": 0.10874441266059875, "learning_rate": 8.634418696234404e-06, "loss": 0.086, "num_input_tokens_seen": 32200896, "step": 32005 }, { "epoch": 15.091937765205092, "grad_norm": 1.1040610074996948, "learning_rate": 8.62664432212761e-06, "loss": 0.2447, "num_input_tokens_seen": 32207904, "step": 32010 }, { "epoch": 15.094295143800094, "grad_norm": 0.18917180597782135, "learning_rate": 8.61887271975192e-06, "loss": 0.0892, "num_input_tokens_seen": 32213504, "step": 32015 }, { "epoch": 15.096652522395097, "grad_norm": 0.5898904800415039, "learning_rate": 8.611103890422941e-06, "loss": 0.1248, "num_input_tokens_seen": 32218976, "step": 32020 }, { "epoch": 15.099009900990099, "grad_norm": 0.6912766098976135, "learning_rate": 8.60333783545581e-06, "loss": 0.0618, "num_input_tokens_seen": 32222816, "step": 32025 }, { "epoch": 15.101367279585101, "grad_norm": 1.7313510179519653, "learning_rate": 8.59557455616518e-06, "loss": 0.1798, "num_input_tokens_seen": 32229632, "step": 32030 }, { "epoch": 15.103724658180104, "grad_norm": 0.3979628384113312, "learning_rate": 8.58781405386525e-06, "loss": 0.0519, "num_input_tokens_seen": 32234656, "step": 32035 }, { "epoch": 15.106082036775106, "grad_norm": 0.7486695647239685, "learning_rate": 8.580056329869735e-06, "loss": 0.0846, "num_input_tokens_seen": 32240416, "step": 32040 }, { "epoch": 15.108439415370109, "grad_norm": 0.27915793657302856, "learning_rate": 8.572301385491891e-06, "loss": 0.1798, "num_input_tokens_seen": 32244672, "step": 32045 }, { "epoch": 15.110796793965111, "grad_norm": 1.5732057094573975, "learning_rate": 8.564549222044501e-06, "loss": 0.2242, "num_input_tokens_seen": 32250624, "step": 32050 }, { "epoch": 15.113154172560114, "grad_norm": 1.3608191013336182, "learning_rate": 8.556799840839864e-06, "loss": 0.1073, "num_input_tokens_seen": 32255968, "step": 32055 }, { "epoch": 15.115511551155116, "grad_norm": 0.32991936802864075, "learning_rate": 8.549053243189823e-06, "loss": 0.0423, "num_input_tokens_seen": 32261248, "step": 32060 }, { "epoch": 15.117868929750118, "grad_norm": 0.206600621342659, "learning_rate": 8.541309430405747e-06, "loss": 0.0498, "num_input_tokens_seen": 32266560, "step": 32065 }, { "epoch": 15.12022630834512, "grad_norm": 0.21659526228904724, "learning_rate": 8.53356840379853e-06, "loss": 0.0387, "num_input_tokens_seen": 32271328, "step": 32070 }, { "epoch": 15.122583686940123, "grad_norm": 0.02871434949338436, "learning_rate": 8.525830164678603e-06, "loss": 0.035, "num_input_tokens_seen": 32275616, "step": 32075 }, { "epoch": 15.124941065535126, "grad_norm": 0.14094127714633942, "learning_rate": 8.51809471435591e-06, "loss": 0.1161, "num_input_tokens_seen": 32280768, "step": 32080 }, { "epoch": 15.127298444130128, "grad_norm": 0.7347190380096436, "learning_rate": 8.510362054139937e-06, "loss": 0.1073, "num_input_tokens_seen": 32286272, "step": 32085 }, { "epoch": 15.12965582272513, "grad_norm": 0.08361443877220154, "learning_rate": 8.502632185339696e-06, "loss": 0.1135, "num_input_tokens_seen": 32292832, "step": 32090 }, { "epoch": 15.132013201320133, "grad_norm": 1.0804132223129272, "learning_rate": 8.494905109263709e-06, "loss": 0.0509, "num_input_tokens_seen": 32298592, "step": 32095 }, { "epoch": 15.134370579915135, "grad_norm": 0.1548599898815155, "learning_rate": 8.487180827220045e-06, "loss": 0.0668, "num_input_tokens_seen": 32304032, "step": 32100 }, { "epoch": 15.136727958510138, "grad_norm": 2.0301740169525146, "learning_rate": 8.4794593405163e-06, "loss": 0.1326, "num_input_tokens_seen": 32309216, "step": 32105 }, { "epoch": 15.13908533710514, "grad_norm": 2.9370222091674805, "learning_rate": 8.47174065045958e-06, "loss": 0.0885, "num_input_tokens_seen": 32313312, "step": 32110 }, { "epoch": 15.14144271570014, "grad_norm": 0.22986091673374176, "learning_rate": 8.464024758356539e-06, "loss": 0.1648, "num_input_tokens_seen": 32318048, "step": 32115 }, { "epoch": 15.143800094295143, "grad_norm": 1.4016557931900024, "learning_rate": 8.456311665513338e-06, "loss": 0.081, "num_input_tokens_seen": 32322208, "step": 32120 }, { "epoch": 15.146157472890145, "grad_norm": 0.21920284628868103, "learning_rate": 8.448601373235678e-06, "loss": 0.2181, "num_input_tokens_seen": 32326944, "step": 32125 }, { "epoch": 15.148514851485148, "grad_norm": 0.27820613980293274, "learning_rate": 8.440893882828788e-06, "loss": 0.2167, "num_input_tokens_seen": 32331168, "step": 32130 }, { "epoch": 15.15087223008015, "grad_norm": 1.171427845954895, "learning_rate": 8.433189195597396e-06, "loss": 0.0832, "num_input_tokens_seen": 32337568, "step": 32135 }, { "epoch": 15.153229608675153, "grad_norm": 0.9740849137306213, "learning_rate": 8.425487312845792e-06, "loss": 0.0859, "num_input_tokens_seen": 32342176, "step": 32140 }, { "epoch": 15.155586987270155, "grad_norm": 0.17595092952251434, "learning_rate": 8.41778823587776e-06, "loss": 0.0515, "num_input_tokens_seen": 32347072, "step": 32145 }, { "epoch": 15.157944365865157, "grad_norm": 1.499386191368103, "learning_rate": 8.41009196599663e-06, "loss": 0.05, "num_input_tokens_seen": 32351392, "step": 32150 }, { "epoch": 15.16030174446016, "grad_norm": 0.686860978603363, "learning_rate": 8.402398504505252e-06, "loss": 0.0473, "num_input_tokens_seen": 32356160, "step": 32155 }, { "epoch": 15.162659123055162, "grad_norm": 3.26220440864563, "learning_rate": 8.394707852705991e-06, "loss": 0.0974, "num_input_tokens_seen": 32360896, "step": 32160 }, { "epoch": 15.165016501650165, "grad_norm": 0.09054747223854065, "learning_rate": 8.387020011900751e-06, "loss": 0.0648, "num_input_tokens_seen": 32365152, "step": 32165 }, { "epoch": 15.167373880245167, "grad_norm": 0.01925698295235634, "learning_rate": 8.379334983390951e-06, "loss": 0.0177, "num_input_tokens_seen": 32370848, "step": 32170 }, { "epoch": 15.16973125884017, "grad_norm": 0.3920444846153259, "learning_rate": 8.371652768477536e-06, "loss": 0.0409, "num_input_tokens_seen": 32376672, "step": 32175 }, { "epoch": 15.172088637435172, "grad_norm": 0.641478955745697, "learning_rate": 8.363973368460981e-06, "loss": 0.019, "num_input_tokens_seen": 32381120, "step": 32180 }, { "epoch": 15.174446016030174, "grad_norm": 0.06828879565000534, "learning_rate": 8.356296784641263e-06, "loss": 0.1008, "num_input_tokens_seen": 32387520, "step": 32185 }, { "epoch": 15.176803394625177, "grad_norm": 0.38596805930137634, "learning_rate": 8.348623018317905e-06, "loss": 0.1689, "num_input_tokens_seen": 32392608, "step": 32190 }, { "epoch": 15.179160773220179, "grad_norm": 0.3811119496822357, "learning_rate": 8.340952070789946e-06, "loss": 0.1095, "num_input_tokens_seen": 32397056, "step": 32195 }, { "epoch": 15.181518151815181, "grad_norm": 0.4177309572696686, "learning_rate": 8.333283943355945e-06, "loss": 0.1001, "num_input_tokens_seen": 32402048, "step": 32200 }, { "epoch": 15.183875530410184, "grad_norm": 0.7596080303192139, "learning_rate": 8.325618637313993e-06, "loss": 0.0975, "num_input_tokens_seen": 32409440, "step": 32205 }, { "epoch": 15.186232909005186, "grad_norm": 1.4333469867706299, "learning_rate": 8.317956153961688e-06, "loss": 0.1265, "num_input_tokens_seen": 32414944, "step": 32210 }, { "epoch": 15.188590287600189, "grad_norm": 0.02999374456703663, "learning_rate": 8.31029649459616e-06, "loss": 0.0212, "num_input_tokens_seen": 32420544, "step": 32215 }, { "epoch": 15.190947666195191, "grad_norm": 0.10383139550685883, "learning_rate": 8.302639660514069e-06, "loss": 0.0149, "num_input_tokens_seen": 32425984, "step": 32220 }, { "epoch": 15.193305044790193, "grad_norm": 0.15300871431827545, "learning_rate": 8.29498565301157e-06, "loss": 0.0241, "num_input_tokens_seen": 32431680, "step": 32225 }, { "epoch": 15.195662423385196, "grad_norm": 0.09199803322553635, "learning_rate": 8.287334473384367e-06, "loss": 0.1244, "num_input_tokens_seen": 32436448, "step": 32230 }, { "epoch": 15.198019801980198, "grad_norm": 1.4996038675308228, "learning_rate": 8.27968612292767e-06, "loss": 0.0934, "num_input_tokens_seen": 32442976, "step": 32235 }, { "epoch": 15.2003771805752, "grad_norm": 2.953301191329956, "learning_rate": 8.272040602936218e-06, "loss": 0.0682, "num_input_tokens_seen": 32447840, "step": 32240 }, { "epoch": 15.202734559170203, "grad_norm": 0.45158877968788147, "learning_rate": 8.26439791470427e-06, "loss": 0.2543, "num_input_tokens_seen": 32452352, "step": 32245 }, { "epoch": 15.205091937765205, "grad_norm": 0.05961398035287857, "learning_rate": 8.256758059525598e-06, "loss": 0.018, "num_input_tokens_seen": 32456448, "step": 32250 }, { "epoch": 15.207449316360208, "grad_norm": 0.01912752166390419, "learning_rate": 8.249121038693505e-06, "loss": 0.0592, "num_input_tokens_seen": 32460544, "step": 32255 }, { "epoch": 15.20980669495521, "grad_norm": 0.47537294030189514, "learning_rate": 8.241486853500814e-06, "loss": 0.0376, "num_input_tokens_seen": 32465728, "step": 32260 }, { "epoch": 15.212164073550213, "grad_norm": 0.2532842457294464, "learning_rate": 8.233855505239843e-06, "loss": 0.1425, "num_input_tokens_seen": 32471200, "step": 32265 }, { "epoch": 15.214521452145215, "grad_norm": 0.6570518016815186, "learning_rate": 8.226226995202466e-06, "loss": 0.1038, "num_input_tokens_seen": 32476000, "step": 32270 }, { "epoch": 15.216878830740217, "grad_norm": 0.1717565804719925, "learning_rate": 8.218601324680055e-06, "loss": 0.0984, "num_input_tokens_seen": 32481056, "step": 32275 }, { "epoch": 15.21923620933522, "grad_norm": 0.014960729517042637, "learning_rate": 8.21097849496351e-06, "loss": 0.0558, "num_input_tokens_seen": 32486240, "step": 32280 }, { "epoch": 15.221593587930222, "grad_norm": 0.9474185705184937, "learning_rate": 8.203358507343242e-06, "loss": 0.1523, "num_input_tokens_seen": 32491072, "step": 32285 }, { "epoch": 15.223950966525225, "grad_norm": 0.4098374843597412, "learning_rate": 8.195741363109188e-06, "loss": 0.0548, "num_input_tokens_seen": 32495232, "step": 32290 }, { "epoch": 15.226308345120227, "grad_norm": 1.4964897632598877, "learning_rate": 8.188127063550801e-06, "loss": 0.2658, "num_input_tokens_seen": 32499936, "step": 32295 }, { "epoch": 15.22866572371523, "grad_norm": 0.049229394644498825, "learning_rate": 8.180515609957062e-06, "loss": 0.0368, "num_input_tokens_seen": 32504768, "step": 32300 }, { "epoch": 15.231023102310232, "grad_norm": 0.571354866027832, "learning_rate": 8.172907003616445e-06, "loss": 0.1082, "num_input_tokens_seen": 32510208, "step": 32305 }, { "epoch": 15.233380480905234, "grad_norm": 0.1750737726688385, "learning_rate": 8.165301245816961e-06, "loss": 0.1645, "num_input_tokens_seen": 32515712, "step": 32310 }, { "epoch": 15.235737859500237, "grad_norm": 0.3794216215610504, "learning_rate": 8.157698337846143e-06, "loss": 0.1203, "num_input_tokens_seen": 32520096, "step": 32315 }, { "epoch": 15.238095238095237, "grad_norm": 0.04375835135579109, "learning_rate": 8.150098280991028e-06, "loss": 0.0499, "num_input_tokens_seen": 32524512, "step": 32320 }, { "epoch": 15.24045261669024, "grad_norm": 0.5678653120994568, "learning_rate": 8.142501076538178e-06, "loss": 0.0184, "num_input_tokens_seen": 32529408, "step": 32325 }, { "epoch": 15.242809995285242, "grad_norm": 0.022094743326306343, "learning_rate": 8.134906725773674e-06, "loss": 0.0584, "num_input_tokens_seen": 32534016, "step": 32330 }, { "epoch": 15.245167373880244, "grad_norm": 0.46356910467147827, "learning_rate": 8.127315229983106e-06, "loss": 0.1195, "num_input_tokens_seen": 32539200, "step": 32335 }, { "epoch": 15.247524752475247, "grad_norm": 0.35528451204299927, "learning_rate": 8.119726590451598e-06, "loss": 0.0269, "num_input_tokens_seen": 32543552, "step": 32340 }, { "epoch": 15.24988213107025, "grad_norm": 0.14350764453411102, "learning_rate": 8.112140808463759e-06, "loss": 0.1024, "num_input_tokens_seen": 32548864, "step": 32345 }, { "epoch": 15.252239509665252, "grad_norm": 0.05788083001971245, "learning_rate": 8.10455788530374e-06, "loss": 0.1331, "num_input_tokens_seen": 32552736, "step": 32350 }, { "epoch": 15.254596888260254, "grad_norm": 1.7708417177200317, "learning_rate": 8.096977822255205e-06, "loss": 0.1673, "num_input_tokens_seen": 32558368, "step": 32355 }, { "epoch": 15.256954266855256, "grad_norm": 0.11317821592092514, "learning_rate": 8.089400620601334e-06, "loss": 0.1286, "num_input_tokens_seen": 32563680, "step": 32360 }, { "epoch": 15.259311645450259, "grad_norm": 0.05738271400332451, "learning_rate": 8.081826281624805e-06, "loss": 0.0252, "num_input_tokens_seen": 32568448, "step": 32365 }, { "epoch": 15.261669024045261, "grad_norm": 1.539385199546814, "learning_rate": 8.074254806607833e-06, "loss": 0.2125, "num_input_tokens_seen": 32573824, "step": 32370 }, { "epoch": 15.264026402640264, "grad_norm": 0.13566382229328156, "learning_rate": 8.066686196832138e-06, "loss": 0.171, "num_input_tokens_seen": 32579072, "step": 32375 }, { "epoch": 15.266383781235266, "grad_norm": 0.19451351463794708, "learning_rate": 8.05912045357896e-06, "loss": 0.1335, "num_input_tokens_seen": 32583712, "step": 32380 }, { "epoch": 15.268741159830268, "grad_norm": 0.026279907673597336, "learning_rate": 8.051557578129057e-06, "loss": 0.0583, "num_input_tokens_seen": 32588704, "step": 32385 }, { "epoch": 15.27109853842527, "grad_norm": 0.24987401068210602, "learning_rate": 8.043997571762679e-06, "loss": 0.097, "num_input_tokens_seen": 32593376, "step": 32390 }, { "epoch": 15.273455917020273, "grad_norm": 2.6632184982299805, "learning_rate": 8.036440435759616e-06, "loss": 0.1247, "num_input_tokens_seen": 32598016, "step": 32395 }, { "epoch": 15.275813295615276, "grad_norm": 0.11836647987365723, "learning_rate": 8.028886171399164e-06, "loss": 0.0801, "num_input_tokens_seen": 32605280, "step": 32400 }, { "epoch": 15.278170674210278, "grad_norm": 1.3006439208984375, "learning_rate": 8.021334779960127e-06, "loss": 0.0915, "num_input_tokens_seen": 32610304, "step": 32405 }, { "epoch": 15.28052805280528, "grad_norm": 0.0900958701968193, "learning_rate": 8.013786262720832e-06, "loss": 0.0503, "num_input_tokens_seen": 32615232, "step": 32410 }, { "epoch": 15.282885431400283, "grad_norm": 0.7686972618103027, "learning_rate": 8.006240620959112e-06, "loss": 0.1012, "num_input_tokens_seen": 32619712, "step": 32415 }, { "epoch": 15.285242809995285, "grad_norm": 0.10357806086540222, "learning_rate": 7.99869785595232e-06, "loss": 0.0537, "num_input_tokens_seen": 32624448, "step": 32420 }, { "epoch": 15.287600188590288, "grad_norm": 1.0726439952850342, "learning_rate": 7.991157968977323e-06, "loss": 0.2084, "num_input_tokens_seen": 32629152, "step": 32425 }, { "epoch": 15.28995756718529, "grad_norm": 2.071716547012329, "learning_rate": 7.983620961310478e-06, "loss": 0.1668, "num_input_tokens_seen": 32634816, "step": 32430 }, { "epoch": 15.292314945780292, "grad_norm": 0.01407082099467516, "learning_rate": 7.976086834227684e-06, "loss": 0.2493, "num_input_tokens_seen": 32639744, "step": 32435 }, { "epoch": 15.294672324375295, "grad_norm": 0.15385839343070984, "learning_rate": 7.96855558900434e-06, "loss": 0.02, "num_input_tokens_seen": 32643968, "step": 32440 }, { "epoch": 15.297029702970297, "grad_norm": 0.22619450092315674, "learning_rate": 7.961027226915355e-06, "loss": 0.0422, "num_input_tokens_seen": 32649120, "step": 32445 }, { "epoch": 15.2993870815653, "grad_norm": 1.3703453540802002, "learning_rate": 7.953501749235157e-06, "loss": 0.046, "num_input_tokens_seen": 32653088, "step": 32450 }, { "epoch": 15.301744460160302, "grad_norm": 0.2665848433971405, "learning_rate": 7.94597915723768e-06, "loss": 0.1137, "num_input_tokens_seen": 32658176, "step": 32455 }, { "epoch": 15.304101838755304, "grad_norm": 1.718353033065796, "learning_rate": 7.938459452196367e-06, "loss": 0.047, "num_input_tokens_seen": 32663232, "step": 32460 }, { "epoch": 15.306459217350307, "grad_norm": 0.9738874435424805, "learning_rate": 7.930942635384192e-06, "loss": 0.0394, "num_input_tokens_seen": 32670272, "step": 32465 }, { "epoch": 15.30881659594531, "grad_norm": 1.2268880605697632, "learning_rate": 7.923428708073605e-06, "loss": 0.2586, "num_input_tokens_seen": 32674656, "step": 32470 }, { "epoch": 15.311173974540312, "grad_norm": 0.12222179025411606, "learning_rate": 7.915917671536594e-06, "loss": 0.0827, "num_input_tokens_seen": 32683008, "step": 32475 }, { "epoch": 15.313531353135314, "grad_norm": 1.2063390016555786, "learning_rate": 7.908409527044647e-06, "loss": 0.1303, "num_input_tokens_seen": 32687776, "step": 32480 }, { "epoch": 15.315888731730317, "grad_norm": 0.6940265893936157, "learning_rate": 7.90090427586877e-06, "loss": 0.0942, "num_input_tokens_seen": 32692576, "step": 32485 }, { "epoch": 15.318246110325319, "grad_norm": 0.2060505896806717, "learning_rate": 7.893401919279475e-06, "loss": 0.0258, "num_input_tokens_seen": 32697376, "step": 32490 }, { "epoch": 15.320603488920321, "grad_norm": 0.2705741822719574, "learning_rate": 7.885902458546779e-06, "loss": 0.214, "num_input_tokens_seen": 32702432, "step": 32495 }, { "epoch": 15.322960867515324, "grad_norm": 0.49900662899017334, "learning_rate": 7.878405894940219e-06, "loss": 0.0665, "num_input_tokens_seen": 32706944, "step": 32500 }, { "epoch": 15.325318246110326, "grad_norm": 1.0454087257385254, "learning_rate": 7.870912229728838e-06, "loss": 0.0422, "num_input_tokens_seen": 32711264, "step": 32505 }, { "epoch": 15.327675624705329, "grad_norm": 0.26497316360473633, "learning_rate": 7.863421464181173e-06, "loss": 0.1496, "num_input_tokens_seen": 32715744, "step": 32510 }, { "epoch": 15.33003300330033, "grad_norm": 1.5043303966522217, "learning_rate": 7.855933599565293e-06, "loss": 0.1499, "num_input_tokens_seen": 32721248, "step": 32515 }, { "epoch": 15.332390381895332, "grad_norm": 1.1004726886749268, "learning_rate": 7.848448637148765e-06, "loss": 0.1251, "num_input_tokens_seen": 32725888, "step": 32520 }, { "epoch": 15.334747760490334, "grad_norm": 2.9380712509155273, "learning_rate": 7.840966578198666e-06, "loss": 0.149, "num_input_tokens_seen": 32730944, "step": 32525 }, { "epoch": 15.337105139085336, "grad_norm": 0.8425098657608032, "learning_rate": 7.833487423981581e-06, "loss": 0.0436, "num_input_tokens_seen": 32735840, "step": 32530 }, { "epoch": 15.339462517680339, "grad_norm": 0.13266372680664062, "learning_rate": 7.826011175763603e-06, "loss": 0.0107, "num_input_tokens_seen": 32740000, "step": 32535 }, { "epoch": 15.341819896275341, "grad_norm": 0.0845058262348175, "learning_rate": 7.818537834810339e-06, "loss": 0.0612, "num_input_tokens_seen": 32746400, "step": 32540 }, { "epoch": 15.344177274870344, "grad_norm": 0.44231343269348145, "learning_rate": 7.811067402386898e-06, "loss": 0.2102, "num_input_tokens_seen": 32751840, "step": 32545 }, { "epoch": 15.346534653465346, "grad_norm": 0.30400580167770386, "learning_rate": 7.803599879757889e-06, "loss": 0.0331, "num_input_tokens_seen": 32757504, "step": 32550 }, { "epoch": 15.348892032060348, "grad_norm": 0.4239334166049957, "learning_rate": 7.796135268187444e-06, "loss": 0.0549, "num_input_tokens_seen": 32761824, "step": 32555 }, { "epoch": 15.35124941065535, "grad_norm": 0.5371168255805969, "learning_rate": 7.78867356893919e-06, "loss": 0.1041, "num_input_tokens_seen": 32766528, "step": 32560 }, { "epoch": 15.353606789250353, "grad_norm": 0.6089521050453186, "learning_rate": 7.78121478327627e-06, "loss": 0.0916, "num_input_tokens_seen": 32771712, "step": 32565 }, { "epoch": 15.355964167845356, "grad_norm": 0.26700615882873535, "learning_rate": 7.77375891246133e-06, "loss": 0.073, "num_input_tokens_seen": 32776192, "step": 32570 }, { "epoch": 15.358321546440358, "grad_norm": 0.06337504833936691, "learning_rate": 7.766305957756517e-06, "loss": 0.0598, "num_input_tokens_seen": 32780288, "step": 32575 }, { "epoch": 15.36067892503536, "grad_norm": 2.164842367172241, "learning_rate": 7.7588559204235e-06, "loss": 0.1591, "num_input_tokens_seen": 32784992, "step": 32580 }, { "epoch": 15.363036303630363, "grad_norm": 0.14023225009441376, "learning_rate": 7.751408801723442e-06, "loss": 0.2084, "num_input_tokens_seen": 32789664, "step": 32585 }, { "epoch": 15.365393682225365, "grad_norm": 1.5585439205169678, "learning_rate": 7.743964602917e-06, "loss": 0.0972, "num_input_tokens_seen": 32794400, "step": 32590 }, { "epoch": 15.367751060820368, "grad_norm": 0.25697726011276245, "learning_rate": 7.736523325264373e-06, "loss": 0.0693, "num_input_tokens_seen": 32800192, "step": 32595 }, { "epoch": 15.37010843941537, "grad_norm": 1.0259077548980713, "learning_rate": 7.72908497002522e-06, "loss": 0.0845, "num_input_tokens_seen": 32804832, "step": 32600 }, { "epoch": 15.372465818010372, "grad_norm": 1.1762642860412598, "learning_rate": 7.721649538458734e-06, "loss": 0.1317, "num_input_tokens_seen": 32809248, "step": 32605 }, { "epoch": 15.374823196605375, "grad_norm": 0.07593444734811783, "learning_rate": 7.714217031823615e-06, "loss": 0.1018, "num_input_tokens_seen": 32814208, "step": 32610 }, { "epoch": 15.377180575200377, "grad_norm": 0.06466738879680634, "learning_rate": 7.706787451378055e-06, "loss": 0.0216, "num_input_tokens_seen": 32820256, "step": 32615 }, { "epoch": 15.37953795379538, "grad_norm": 0.10438031703233719, "learning_rate": 7.69936079837976e-06, "loss": 0.1755, "num_input_tokens_seen": 32824640, "step": 32620 }, { "epoch": 15.381895332390382, "grad_norm": 0.051410991698503494, "learning_rate": 7.691937074085934e-06, "loss": 0.0968, "num_input_tokens_seen": 32829888, "step": 32625 }, { "epoch": 15.384252710985384, "grad_norm": 1.7500419616699219, "learning_rate": 7.684516279753284e-06, "loss": 0.2201, "num_input_tokens_seen": 32833792, "step": 32630 }, { "epoch": 15.386610089580387, "grad_norm": 0.5468211770057678, "learning_rate": 7.677098416638037e-06, "loss": 0.0806, "num_input_tokens_seen": 32839264, "step": 32635 }, { "epoch": 15.38896746817539, "grad_norm": 0.37834444642066956, "learning_rate": 7.669683485995893e-06, "loss": 0.0863, "num_input_tokens_seen": 32845280, "step": 32640 }, { "epoch": 15.391324846770392, "grad_norm": 0.021134410053491592, "learning_rate": 7.662271489082084e-06, "loss": 0.0311, "num_input_tokens_seen": 32849728, "step": 32645 }, { "epoch": 15.393682225365394, "grad_norm": 2.203665256500244, "learning_rate": 7.654862427151336e-06, "loss": 0.2215, "num_input_tokens_seen": 32854336, "step": 32650 }, { "epoch": 15.396039603960396, "grad_norm": 0.47130846977233887, "learning_rate": 7.647456301457873e-06, "loss": 0.0737, "num_input_tokens_seen": 32859008, "step": 32655 }, { "epoch": 15.398396982555399, "grad_norm": 0.32997018098831177, "learning_rate": 7.640053113255427e-06, "loss": 0.2301, "num_input_tokens_seen": 32863808, "step": 32660 }, { "epoch": 15.400754361150401, "grad_norm": 0.39036813378334045, "learning_rate": 7.632652863797238e-06, "loss": 0.0673, "num_input_tokens_seen": 32868320, "step": 32665 }, { "epoch": 15.403111739745404, "grad_norm": 0.4576556384563446, "learning_rate": 7.625255554336036e-06, "loss": 0.1419, "num_input_tokens_seen": 32874368, "step": 32670 }, { "epoch": 15.405469118340406, "grad_norm": 0.8615638613700867, "learning_rate": 7.61786118612407e-06, "loss": 0.0873, "num_input_tokens_seen": 32879264, "step": 32675 }, { "epoch": 15.407826496935408, "grad_norm": 0.38522419333457947, "learning_rate": 7.6104697604130664e-06, "loss": 0.0284, "num_input_tokens_seen": 32883840, "step": 32680 }, { "epoch": 15.41018387553041, "grad_norm": 0.03801531717181206, "learning_rate": 7.603081278454274e-06, "loss": 0.0641, "num_input_tokens_seen": 32889312, "step": 32685 }, { "epoch": 15.412541254125413, "grad_norm": 1.306681752204895, "learning_rate": 7.595695741498438e-06, "loss": 0.3549, "num_input_tokens_seen": 32893888, "step": 32690 }, { "epoch": 15.414898632720416, "grad_norm": 0.041020993143320084, "learning_rate": 7.588313150795806e-06, "loss": 0.0475, "num_input_tokens_seen": 32899488, "step": 32695 }, { "epoch": 15.417256011315418, "grad_norm": 0.6953151822090149, "learning_rate": 7.580933507596124e-06, "loss": 0.0736, "num_input_tokens_seen": 32905056, "step": 32700 }, { "epoch": 15.41961338991042, "grad_norm": 0.20748773217201233, "learning_rate": 7.573556813148639e-06, "loss": 0.1387, "num_input_tokens_seen": 32910176, "step": 32705 }, { "epoch": 15.421970768505423, "grad_norm": 0.09281980991363525, "learning_rate": 7.566183068702101e-06, "loss": 0.0626, "num_input_tokens_seen": 32914944, "step": 32710 }, { "epoch": 15.424328147100425, "grad_norm": 1.6093519926071167, "learning_rate": 7.55881227550477e-06, "loss": 0.0476, "num_input_tokens_seen": 32919264, "step": 32715 }, { "epoch": 15.426685525695426, "grad_norm": 0.34545475244522095, "learning_rate": 7.5514444348043775e-06, "loss": 0.0592, "num_input_tokens_seen": 32924288, "step": 32720 }, { "epoch": 15.429042904290428, "grad_norm": 0.22029992938041687, "learning_rate": 7.5440795478481815e-06, "loss": 0.2539, "num_input_tokens_seen": 32929792, "step": 32725 }, { "epoch": 15.43140028288543, "grad_norm": 0.08326520770788193, "learning_rate": 7.536717615882935e-06, "loss": 0.1946, "num_input_tokens_seen": 32936032, "step": 32730 }, { "epoch": 15.433757661480433, "grad_norm": 0.9958723187446594, "learning_rate": 7.529358640154885e-06, "loss": 0.0733, "num_input_tokens_seen": 32940064, "step": 32735 }, { "epoch": 15.436115040075435, "grad_norm": 1.5205323696136475, "learning_rate": 7.522002621909785e-06, "loss": 0.2648, "num_input_tokens_seen": 32945824, "step": 32740 }, { "epoch": 15.438472418670438, "grad_norm": 1.6462351083755493, "learning_rate": 7.5146495623928785e-06, "loss": 0.124, "num_input_tokens_seen": 32950496, "step": 32745 }, { "epoch": 15.44082979726544, "grad_norm": 1.8278849124908447, "learning_rate": 7.5072994628489185e-06, "loss": 0.0958, "num_input_tokens_seen": 32955616, "step": 32750 }, { "epoch": 15.443187175860443, "grad_norm": 0.018702201545238495, "learning_rate": 7.499952324522158e-06, "loss": 0.2224, "num_input_tokens_seen": 32960384, "step": 32755 }, { "epoch": 15.445544554455445, "grad_norm": 0.15033632516860962, "learning_rate": 7.492608148656327e-06, "loss": 0.1918, "num_input_tokens_seen": 32967872, "step": 32760 }, { "epoch": 15.447901933050447, "grad_norm": 0.19343054294586182, "learning_rate": 7.485266936494678e-06, "loss": 0.0923, "num_input_tokens_seen": 32975872, "step": 32765 }, { "epoch": 15.45025931164545, "grad_norm": 0.07339876890182495, "learning_rate": 7.477928689279953e-06, "loss": 0.1048, "num_input_tokens_seen": 32980064, "step": 32770 }, { "epoch": 15.452616690240452, "grad_norm": 0.07379300147294998, "learning_rate": 7.470593408254395e-06, "loss": 0.1591, "num_input_tokens_seen": 32985312, "step": 32775 }, { "epoch": 15.454974068835455, "grad_norm": 1.714000940322876, "learning_rate": 7.463261094659738e-06, "loss": 0.1626, "num_input_tokens_seen": 32990368, "step": 32780 }, { "epoch": 15.457331447430457, "grad_norm": 0.1064937636256218, "learning_rate": 7.455931749737222e-06, "loss": 0.0119, "num_input_tokens_seen": 32995264, "step": 32785 }, { "epoch": 15.45968882602546, "grad_norm": 1.196959137916565, "learning_rate": 7.44860537472758e-06, "loss": 0.284, "num_input_tokens_seen": 33000992, "step": 32790 }, { "epoch": 15.462046204620462, "grad_norm": 0.6178011894226074, "learning_rate": 7.441281970871047e-06, "loss": 0.0597, "num_input_tokens_seen": 33005920, "step": 32795 }, { "epoch": 15.464403583215464, "grad_norm": 1.5741013288497925, "learning_rate": 7.433961539407341e-06, "loss": 0.1313, "num_input_tokens_seen": 33012192, "step": 32800 }, { "epoch": 15.466760961810467, "grad_norm": 0.17601065337657928, "learning_rate": 7.426644081575693e-06, "loss": 0.0531, "num_input_tokens_seen": 33016032, "step": 32805 }, { "epoch": 15.469118340405469, "grad_norm": 0.26031628251075745, "learning_rate": 7.41932959861483e-06, "loss": 0.1532, "num_input_tokens_seen": 33021760, "step": 32810 }, { "epoch": 15.471475719000471, "grad_norm": 0.16014012694358826, "learning_rate": 7.4120180917629565e-06, "loss": 0.136, "num_input_tokens_seen": 33026432, "step": 32815 }, { "epoch": 15.473833097595474, "grad_norm": 0.08376717567443848, "learning_rate": 7.404709562257791e-06, "loss": 0.0349, "num_input_tokens_seen": 33031360, "step": 32820 }, { "epoch": 15.476190476190476, "grad_norm": 2.269576072692871, "learning_rate": 7.397404011336548e-06, "loss": 0.1755, "num_input_tokens_seen": 33037664, "step": 32825 }, { "epoch": 15.478547854785479, "grad_norm": 0.8003389239311218, "learning_rate": 7.390101440235927e-06, "loss": 0.1368, "num_input_tokens_seen": 33042848, "step": 32830 }, { "epoch": 15.480905233380481, "grad_norm": 0.7831518054008484, "learning_rate": 7.382801850192136e-06, "loss": 0.07, "num_input_tokens_seen": 33049120, "step": 32835 }, { "epoch": 15.483262611975483, "grad_norm": 0.5739724040031433, "learning_rate": 7.37550524244087e-06, "loss": 0.0625, "num_input_tokens_seen": 33054176, "step": 32840 }, { "epoch": 15.485619990570486, "grad_norm": 2.4954099655151367, "learning_rate": 7.3682116182173245e-06, "loss": 0.3472, "num_input_tokens_seen": 33058368, "step": 32845 }, { "epoch": 15.487977369165488, "grad_norm": 0.07773856818675995, "learning_rate": 7.360920978756175e-06, "loss": 0.0957, "num_input_tokens_seen": 33063104, "step": 32850 }, { "epoch": 15.49033474776049, "grad_norm": 0.5671908259391785, "learning_rate": 7.353633325291609e-06, "loss": 0.1478, "num_input_tokens_seen": 33068320, "step": 32855 }, { "epoch": 15.492692126355493, "grad_norm": 0.6906540989875793, "learning_rate": 7.346348659057303e-06, "loss": 0.1138, "num_input_tokens_seen": 33072480, "step": 32860 }, { "epoch": 15.495049504950495, "grad_norm": 1.1789250373840332, "learning_rate": 7.339066981286427e-06, "loss": 0.1355, "num_input_tokens_seen": 33079168, "step": 32865 }, { "epoch": 15.497406883545498, "grad_norm": 0.03582534193992615, "learning_rate": 7.331788293211644e-06, "loss": 0.0436, "num_input_tokens_seen": 33083552, "step": 32870 }, { "epoch": 15.4997642621405, "grad_norm": 0.631868302822113, "learning_rate": 7.324512596065117e-06, "loss": 0.0724, "num_input_tokens_seen": 33088928, "step": 32875 }, { "epoch": 15.502121640735503, "grad_norm": 2.135772466659546, "learning_rate": 7.317239891078492e-06, "loss": 0.1387, "num_input_tokens_seen": 33094720, "step": 32880 }, { "epoch": 15.504479019330505, "grad_norm": 1.8334742784500122, "learning_rate": 7.309970179482925e-06, "loss": 0.2859, "num_input_tokens_seen": 33100000, "step": 32885 }, { "epoch": 15.506836397925507, "grad_norm": 0.1430538296699524, "learning_rate": 7.302703462509042e-06, "loss": 0.1679, "num_input_tokens_seen": 33105088, "step": 32890 }, { "epoch": 15.50919377652051, "grad_norm": 1.1140137910842896, "learning_rate": 7.295439741386978e-06, "loss": 0.0825, "num_input_tokens_seen": 33108672, "step": 32895 }, { "epoch": 15.511551155115512, "grad_norm": 1.076145052909851, "learning_rate": 7.288179017346358e-06, "loss": 0.2231, "num_input_tokens_seen": 33113120, "step": 32900 }, { "epoch": 15.513908533710515, "grad_norm": 0.11942669004201889, "learning_rate": 7.280921291616302e-06, "loss": 0.0245, "num_input_tokens_seen": 33118176, "step": 32905 }, { "epoch": 15.516265912305517, "grad_norm": 0.5922088027000427, "learning_rate": 7.273666565425419e-06, "loss": 0.0382, "num_input_tokens_seen": 33122784, "step": 32910 }, { "epoch": 15.518623290900518, "grad_norm": 0.3716523349285126, "learning_rate": 7.266414840001812e-06, "loss": 0.0641, "num_input_tokens_seen": 33128672, "step": 32915 }, { "epoch": 15.520980669495522, "grad_norm": 0.5075357556343079, "learning_rate": 7.259166116573071e-06, "loss": 0.0353, "num_input_tokens_seen": 33132736, "step": 32920 }, { "epoch": 15.523338048090523, "grad_norm": 1.5336716175079346, "learning_rate": 7.251920396366293e-06, "loss": 0.1627, "num_input_tokens_seen": 33137568, "step": 32925 }, { "epoch": 15.525695426685525, "grad_norm": 0.21631762385368347, "learning_rate": 7.244677680608039e-06, "loss": 0.0562, "num_input_tokens_seen": 33142368, "step": 32930 }, { "epoch": 15.528052805280527, "grad_norm": 2.2453360557556152, "learning_rate": 7.237437970524385e-06, "loss": 0.1226, "num_input_tokens_seen": 33146752, "step": 32935 }, { "epoch": 15.53041018387553, "grad_norm": 1.2147538661956787, "learning_rate": 7.230201267340891e-06, "loss": 0.1031, "num_input_tokens_seen": 33152384, "step": 32940 }, { "epoch": 15.532767562470532, "grad_norm": 0.4510137438774109, "learning_rate": 7.22296757228261e-06, "loss": 0.0889, "num_input_tokens_seen": 33157888, "step": 32945 }, { "epoch": 15.535124941065535, "grad_norm": 0.09811042994260788, "learning_rate": 7.215736886574079e-06, "loss": 0.1703, "num_input_tokens_seen": 33163648, "step": 32950 }, { "epoch": 15.537482319660537, "grad_norm": 0.26176249980926514, "learning_rate": 7.2085092114393375e-06, "loss": 0.137, "num_input_tokens_seen": 33167808, "step": 32955 }, { "epoch": 15.53983969825554, "grad_norm": 0.12320747971534729, "learning_rate": 7.2012845481019026e-06, "loss": 0.0857, "num_input_tokens_seen": 33172480, "step": 32960 }, { "epoch": 15.542197076850542, "grad_norm": 0.8297690749168396, "learning_rate": 7.194062897784795e-06, "loss": 0.1898, "num_input_tokens_seen": 33177056, "step": 32965 }, { "epoch": 15.544554455445544, "grad_norm": 1.5065356492996216, "learning_rate": 7.186844261710504e-06, "loss": 0.1189, "num_input_tokens_seen": 33182624, "step": 32970 }, { "epoch": 15.546911834040547, "grad_norm": 0.43391600251197815, "learning_rate": 7.17962864110103e-06, "loss": 0.0966, "num_input_tokens_seen": 33187552, "step": 32975 }, { "epoch": 15.549269212635549, "grad_norm": 2.3797717094421387, "learning_rate": 7.172416037177854e-06, "loss": 0.1348, "num_input_tokens_seen": 33192768, "step": 32980 }, { "epoch": 15.551626591230551, "grad_norm": 0.027550524100661278, "learning_rate": 7.165206451161949e-06, "loss": 0.024, "num_input_tokens_seen": 33196544, "step": 32985 }, { "epoch": 15.553983969825554, "grad_norm": 0.7905187606811523, "learning_rate": 7.157999884273772e-06, "loss": 0.0392, "num_input_tokens_seen": 33202016, "step": 32990 }, { "epoch": 15.556341348420556, "grad_norm": 0.022072937339544296, "learning_rate": 7.150796337733276e-06, "loss": 0.0569, "num_input_tokens_seen": 33206656, "step": 32995 }, { "epoch": 15.558698727015559, "grad_norm": 0.11811266839504242, "learning_rate": 7.143595812759899e-06, "loss": 0.0795, "num_input_tokens_seen": 33211872, "step": 33000 }, { "epoch": 15.561056105610561, "grad_norm": 0.28111910820007324, "learning_rate": 7.136398310572573e-06, "loss": 0.0396, "num_input_tokens_seen": 33216832, "step": 33005 }, { "epoch": 15.563413484205563, "grad_norm": 0.1808711141347885, "learning_rate": 7.1292038323897e-06, "loss": 0.0637, "num_input_tokens_seen": 33221664, "step": 33010 }, { "epoch": 15.565770862800566, "grad_norm": 0.017774660140275955, "learning_rate": 7.1220123794291855e-06, "loss": 0.1022, "num_input_tokens_seen": 33226848, "step": 33015 }, { "epoch": 15.568128241395568, "grad_norm": 0.4078778922557831, "learning_rate": 7.114823952908428e-06, "loss": 0.0422, "num_input_tokens_seen": 33232192, "step": 33020 }, { "epoch": 15.57048561999057, "grad_norm": 0.6291154026985168, "learning_rate": 7.107638554044302e-06, "loss": 0.1869, "num_input_tokens_seen": 33236800, "step": 33025 }, { "epoch": 15.572842998585573, "grad_norm": 0.5125588774681091, "learning_rate": 7.100456184053173e-06, "loss": 0.1591, "num_input_tokens_seen": 33242080, "step": 33030 }, { "epoch": 15.575200377180575, "grad_norm": 0.7103022336959839, "learning_rate": 7.0932768441509e-06, "loss": 0.1599, "num_input_tokens_seen": 33246976, "step": 33035 }, { "epoch": 15.577557755775578, "grad_norm": 0.11063312739133835, "learning_rate": 7.086100535552814e-06, "loss": 0.0245, "num_input_tokens_seen": 33252160, "step": 33040 }, { "epoch": 15.57991513437058, "grad_norm": 2.31632399559021, "learning_rate": 7.078927259473745e-06, "loss": 0.1625, "num_input_tokens_seen": 33256096, "step": 33045 }, { "epoch": 15.582272512965583, "grad_norm": 0.7946162819862366, "learning_rate": 7.071757017128011e-06, "loss": 0.1005, "num_input_tokens_seen": 33260672, "step": 33050 }, { "epoch": 15.584629891560585, "grad_norm": 0.7556843161582947, "learning_rate": 7.064589809729416e-06, "loss": 0.1106, "num_input_tokens_seen": 33265696, "step": 33055 }, { "epoch": 15.586987270155587, "grad_norm": 0.016981370747089386, "learning_rate": 7.05742563849123e-06, "loss": 0.0114, "num_input_tokens_seen": 33271424, "step": 33060 }, { "epoch": 15.58934464875059, "grad_norm": 0.24168525636196136, "learning_rate": 7.050264504626239e-06, "loss": 0.3097, "num_input_tokens_seen": 33276096, "step": 33065 }, { "epoch": 15.591702027345592, "grad_norm": 0.15289224684238434, "learning_rate": 7.043106409346698e-06, "loss": 0.0201, "num_input_tokens_seen": 33282272, "step": 33070 }, { "epoch": 15.594059405940595, "grad_norm": 2.2541704177856445, "learning_rate": 7.035951353864351e-06, "loss": 0.1717, "num_input_tokens_seen": 33286784, "step": 33075 }, { "epoch": 15.596416784535597, "grad_norm": 1.0562437772750854, "learning_rate": 7.028799339390427e-06, "loss": 0.0733, "num_input_tokens_seen": 33292736, "step": 33080 }, { "epoch": 15.5987741631306, "grad_norm": 0.7594422101974487, "learning_rate": 7.0216503671356406e-06, "loss": 0.0588, "num_input_tokens_seen": 33297696, "step": 33085 }, { "epoch": 15.601131541725602, "grad_norm": 0.2151300460100174, "learning_rate": 7.014504438310193e-06, "loss": 0.0984, "num_input_tokens_seen": 33303296, "step": 33090 }, { "epoch": 15.603488920320604, "grad_norm": 3.527050733566284, "learning_rate": 7.007361554123776e-06, "loss": 0.2227, "num_input_tokens_seen": 33308224, "step": 33095 }, { "epoch": 15.605846298915607, "grad_norm": 0.9379589557647705, "learning_rate": 7.0002217157855405e-06, "loss": 0.0276, "num_input_tokens_seen": 33312736, "step": 33100 }, { "epoch": 15.608203677510609, "grad_norm": 0.39290371537208557, "learning_rate": 6.993084924504151e-06, "loss": 0.0578, "num_input_tokens_seen": 33317824, "step": 33105 }, { "epoch": 15.61056105610561, "grad_norm": 0.07053545862436295, "learning_rate": 6.985951181487743e-06, "loss": 0.1324, "num_input_tokens_seen": 33322688, "step": 33110 }, { "epoch": 15.612918434700614, "grad_norm": 0.849258303642273, "learning_rate": 6.978820487943941e-06, "loss": 0.1456, "num_input_tokens_seen": 33327648, "step": 33115 }, { "epoch": 15.615275813295614, "grad_norm": 0.07719051837921143, "learning_rate": 6.971692845079847e-06, "loss": 0.0406, "num_input_tokens_seen": 33332992, "step": 33120 }, { "epoch": 15.617633191890617, "grad_norm": 0.1791747659444809, "learning_rate": 6.964568254102049e-06, "loss": 0.0743, "num_input_tokens_seen": 33338816, "step": 33125 }, { "epoch": 15.61999057048562, "grad_norm": 0.07332665473222733, "learning_rate": 6.9574467162166254e-06, "loss": 0.1464, "num_input_tokens_seen": 33343936, "step": 33130 }, { "epoch": 15.622347949080622, "grad_norm": 2.190606117248535, "learning_rate": 6.950328232629133e-06, "loss": 0.376, "num_input_tokens_seen": 33349152, "step": 33135 }, { "epoch": 15.624705327675624, "grad_norm": 0.09593641012907028, "learning_rate": 6.943212804544597e-06, "loss": 0.2444, "num_input_tokens_seen": 33355424, "step": 33140 }, { "epoch": 15.627062706270626, "grad_norm": 0.12633126974105835, "learning_rate": 6.936100433167547e-06, "loss": 0.1604, "num_input_tokens_seen": 33359584, "step": 33145 }, { "epoch": 15.629420084865629, "grad_norm": 1.298079013824463, "learning_rate": 6.928991119701986e-06, "loss": 0.118, "num_input_tokens_seen": 33364672, "step": 33150 }, { "epoch": 15.631777463460631, "grad_norm": 0.3904707729816437, "learning_rate": 6.9218848653513995e-06, "loss": 0.0377, "num_input_tokens_seen": 33370208, "step": 33155 }, { "epoch": 15.634134842055634, "grad_norm": 0.21537065505981445, "learning_rate": 6.914781671318757e-06, "loss": 0.1551, "num_input_tokens_seen": 33374656, "step": 33160 }, { "epoch": 15.636492220650636, "grad_norm": 0.28132253885269165, "learning_rate": 6.907681538806507e-06, "loss": 0.0621, "num_input_tokens_seen": 33380000, "step": 33165 }, { "epoch": 15.638849599245638, "grad_norm": 0.38825467228889465, "learning_rate": 6.900584469016583e-06, "loss": 0.0762, "num_input_tokens_seen": 33386688, "step": 33170 }, { "epoch": 15.64120697784064, "grad_norm": 0.5592832565307617, "learning_rate": 6.893490463150407e-06, "loss": 0.1624, "num_input_tokens_seen": 33392416, "step": 33175 }, { "epoch": 15.643564356435643, "grad_norm": 1.6909631490707397, "learning_rate": 6.886399522408854e-06, "loss": 0.0929, "num_input_tokens_seen": 33396416, "step": 33180 }, { "epoch": 15.645921735030646, "grad_norm": 0.30557993054389954, "learning_rate": 6.879311647992312e-06, "loss": 0.055, "num_input_tokens_seen": 33401024, "step": 33185 }, { "epoch": 15.648279113625648, "grad_norm": 0.11626944690942764, "learning_rate": 6.872226841100637e-06, "loss": 0.025, "num_input_tokens_seen": 33405888, "step": 33190 }, { "epoch": 15.65063649222065, "grad_norm": 1.1211824417114258, "learning_rate": 6.865145102933165e-06, "loss": 0.2617, "num_input_tokens_seen": 33410656, "step": 33195 }, { "epoch": 15.652993870815653, "grad_norm": 1.8244763612747192, "learning_rate": 6.8580664346887145e-06, "loss": 0.1041, "num_input_tokens_seen": 33414368, "step": 33200 }, { "epoch": 15.655351249410655, "grad_norm": 0.07957799732685089, "learning_rate": 6.850990837565585e-06, "loss": 0.1845, "num_input_tokens_seen": 33419776, "step": 33205 }, { "epoch": 15.657708628005658, "grad_norm": 0.6204526424407959, "learning_rate": 6.843918312761555e-06, "loss": 0.1317, "num_input_tokens_seen": 33425216, "step": 33210 }, { "epoch": 15.66006600660066, "grad_norm": 0.29102063179016113, "learning_rate": 6.836848861473888e-06, "loss": 0.1353, "num_input_tokens_seen": 33430432, "step": 33215 }, { "epoch": 15.662423385195662, "grad_norm": 0.11478228867053986, "learning_rate": 6.829782484899308e-06, "loss": 0.0351, "num_input_tokens_seen": 33434528, "step": 33220 }, { "epoch": 15.664780763790665, "grad_norm": 1.3311086893081665, "learning_rate": 6.822719184234039e-06, "loss": 0.0264, "num_input_tokens_seen": 33439488, "step": 33225 }, { "epoch": 15.667138142385667, "grad_norm": 1.5857083797454834, "learning_rate": 6.815658960673782e-06, "loss": 0.1289, "num_input_tokens_seen": 33444192, "step": 33230 }, { "epoch": 15.66949552098067, "grad_norm": 0.017142299562692642, "learning_rate": 6.808601815413709e-06, "loss": 0.1337, "num_input_tokens_seen": 33448416, "step": 33235 }, { "epoch": 15.671852899575672, "grad_norm": 0.09085556864738464, "learning_rate": 6.8015477496484794e-06, "loss": 0.0129, "num_input_tokens_seen": 33453440, "step": 33240 }, { "epoch": 15.674210278170674, "grad_norm": 0.6063770651817322, "learning_rate": 6.7944967645722195e-06, "loss": 0.1008, "num_input_tokens_seen": 33459168, "step": 33245 }, { "epoch": 15.676567656765677, "grad_norm": 0.16048474609851837, "learning_rate": 6.787448861378548e-06, "loss": 0.2952, "num_input_tokens_seen": 33464224, "step": 33250 }, { "epoch": 15.67892503536068, "grad_norm": 1.7169245481491089, "learning_rate": 6.78040404126056e-06, "loss": 0.0964, "num_input_tokens_seen": 33468416, "step": 33255 }, { "epoch": 15.681282413955682, "grad_norm": 2.742941379547119, "learning_rate": 6.773362305410816e-06, "loss": 0.137, "num_input_tokens_seen": 33474336, "step": 33260 }, { "epoch": 15.683639792550684, "grad_norm": 0.03652486950159073, "learning_rate": 6.766323655021356e-06, "loss": 0.016, "num_input_tokens_seen": 33479104, "step": 33265 }, { "epoch": 15.685997171145686, "grad_norm": 1.0960606336593628, "learning_rate": 6.759288091283711e-06, "loss": 0.2063, "num_input_tokens_seen": 33484224, "step": 33270 }, { "epoch": 15.688354549740689, "grad_norm": 1.6973304748535156, "learning_rate": 6.752255615388883e-06, "loss": 0.0615, "num_input_tokens_seen": 33491328, "step": 33275 }, { "epoch": 15.690711928335691, "grad_norm": 0.2957333028316498, "learning_rate": 6.745226228527348e-06, "loss": 0.0231, "num_input_tokens_seen": 33496640, "step": 33280 }, { "epoch": 15.693069306930694, "grad_norm": 1.0746923685073853, "learning_rate": 6.738199931889066e-06, "loss": 0.0523, "num_input_tokens_seen": 33501248, "step": 33285 }, { "epoch": 15.695426685525696, "grad_norm": 0.43911606073379517, "learning_rate": 6.731176726663469e-06, "loss": 0.1354, "num_input_tokens_seen": 33506528, "step": 33290 }, { "epoch": 15.697784064120698, "grad_norm": 0.22020961344242096, "learning_rate": 6.724156614039462e-06, "loss": 0.0511, "num_input_tokens_seen": 33511232, "step": 33295 }, { "epoch": 15.700141442715701, "grad_norm": 0.8797290325164795, "learning_rate": 6.717139595205443e-06, "loss": 0.1623, "num_input_tokens_seen": 33515840, "step": 33300 }, { "epoch": 15.702498821310703, "grad_norm": 1.273348093032837, "learning_rate": 6.71012567134926e-06, "loss": 0.1254, "num_input_tokens_seen": 33520704, "step": 33305 }, { "epoch": 15.704856199905706, "grad_norm": 0.4455481171607971, "learning_rate": 6.703114843658254e-06, "loss": 0.0998, "num_input_tokens_seen": 33527264, "step": 33310 }, { "epoch": 15.707213578500706, "grad_norm": 2.5029995441436768, "learning_rate": 6.696107113319242e-06, "loss": 0.1507, "num_input_tokens_seen": 33531584, "step": 33315 }, { "epoch": 15.70957095709571, "grad_norm": 0.22867298126220703, "learning_rate": 6.689102481518514e-06, "loss": 0.0962, "num_input_tokens_seen": 33535648, "step": 33320 }, { "epoch": 15.711928335690711, "grad_norm": 1.9890888929367065, "learning_rate": 6.682100949441833e-06, "loss": 0.3897, "num_input_tokens_seen": 33540928, "step": 33325 }, { "epoch": 15.714285714285714, "grad_norm": 0.2943264842033386, "learning_rate": 6.675102518274443e-06, "loss": 0.0121, "num_input_tokens_seen": 33545312, "step": 33330 }, { "epoch": 15.716643092880716, "grad_norm": 0.09947054088115692, "learning_rate": 6.668107189201056e-06, "loss": 0.0855, "num_input_tokens_seen": 33550496, "step": 33335 }, { "epoch": 15.719000471475718, "grad_norm": 0.46335092186927795, "learning_rate": 6.661114963405871e-06, "loss": 0.0831, "num_input_tokens_seen": 33555008, "step": 33340 }, { "epoch": 15.72135785007072, "grad_norm": 0.19614480435848236, "learning_rate": 6.65412584207254e-06, "loss": 0.1528, "num_input_tokens_seen": 33559104, "step": 33345 }, { "epoch": 15.723715228665723, "grad_norm": 2.5533909797668457, "learning_rate": 6.647139826384208e-06, "loss": 0.0633, "num_input_tokens_seen": 33563680, "step": 33350 }, { "epoch": 15.726072607260726, "grad_norm": 0.21509763598442078, "learning_rate": 6.640156917523491e-06, "loss": 0.0665, "num_input_tokens_seen": 33568064, "step": 33355 }, { "epoch": 15.728429985855728, "grad_norm": 2.39916729927063, "learning_rate": 6.633177116672473e-06, "loss": 0.1816, "num_input_tokens_seen": 33573856, "step": 33360 }, { "epoch": 15.73078736445073, "grad_norm": 0.2655794024467468, "learning_rate": 6.626200425012721e-06, "loss": 0.1747, "num_input_tokens_seen": 33579328, "step": 33365 }, { "epoch": 15.733144743045733, "grad_norm": 0.9079443216323853, "learning_rate": 6.619226843725265e-06, "loss": 0.1734, "num_input_tokens_seen": 33583520, "step": 33370 }, { "epoch": 15.735502121640735, "grad_norm": 0.057239383459091187, "learning_rate": 6.612256373990619e-06, "loss": 0.0753, "num_input_tokens_seen": 33589152, "step": 33375 }, { "epoch": 15.737859500235738, "grad_norm": 0.685539186000824, "learning_rate": 6.605289016988761e-06, "loss": 0.2003, "num_input_tokens_seen": 33593312, "step": 33380 }, { "epoch": 15.74021687883074, "grad_norm": 0.1162046566605568, "learning_rate": 6.598324773899156e-06, "loss": 0.1925, "num_input_tokens_seen": 33598016, "step": 33385 }, { "epoch": 15.742574257425742, "grad_norm": 0.2787592113018036, "learning_rate": 6.591363645900719e-06, "loss": 0.0785, "num_input_tokens_seen": 33602560, "step": 33390 }, { "epoch": 15.744931636020745, "grad_norm": 1.644895076751709, "learning_rate": 6.584405634171853e-06, "loss": 0.185, "num_input_tokens_seen": 33607360, "step": 33395 }, { "epoch": 15.747289014615747, "grad_norm": 0.05146046355366707, "learning_rate": 6.577450739890434e-06, "loss": 0.1164, "num_input_tokens_seen": 33612224, "step": 33400 }, { "epoch": 15.74964639321075, "grad_norm": 0.8735230565071106, "learning_rate": 6.570498964233809e-06, "loss": 0.0508, "num_input_tokens_seen": 33617088, "step": 33405 }, { "epoch": 15.752003771805752, "grad_norm": 1.0152182579040527, "learning_rate": 6.563550308378794e-06, "loss": 0.1725, "num_input_tokens_seen": 33623264, "step": 33410 }, { "epoch": 15.754361150400754, "grad_norm": 1.4502960443496704, "learning_rate": 6.556604773501679e-06, "loss": 0.1248, "num_input_tokens_seen": 33627776, "step": 33415 }, { "epoch": 15.756718528995757, "grad_norm": 0.4640132188796997, "learning_rate": 6.549662360778225e-06, "loss": 0.0953, "num_input_tokens_seen": 33632416, "step": 33420 }, { "epoch": 15.75907590759076, "grad_norm": 1.3007856607437134, "learning_rate": 6.542723071383672e-06, "loss": 0.0653, "num_input_tokens_seen": 33637312, "step": 33425 }, { "epoch": 15.761433286185762, "grad_norm": 1.5852627754211426, "learning_rate": 6.535786906492708e-06, "loss": 0.1651, "num_input_tokens_seen": 33642528, "step": 33430 }, { "epoch": 15.763790664780764, "grad_norm": 0.2705952227115631, "learning_rate": 6.5288538672795166e-06, "loss": 0.1003, "num_input_tokens_seen": 33646912, "step": 33435 }, { "epoch": 15.766148043375766, "grad_norm": 0.053336068987846375, "learning_rate": 6.521923954917741e-06, "loss": 0.0258, "num_input_tokens_seen": 33652672, "step": 33440 }, { "epoch": 15.768505421970769, "grad_norm": 1.2656958103179932, "learning_rate": 6.514997170580503e-06, "loss": 0.1443, "num_input_tokens_seen": 33658336, "step": 33445 }, { "epoch": 15.770862800565771, "grad_norm": 2.647404193878174, "learning_rate": 6.508073515440385e-06, "loss": 0.1843, "num_input_tokens_seen": 33663712, "step": 33450 }, { "epoch": 15.773220179160774, "grad_norm": 1.1290072202682495, "learning_rate": 6.501152990669445e-06, "loss": 0.1303, "num_input_tokens_seen": 33669280, "step": 33455 }, { "epoch": 15.775577557755776, "grad_norm": 0.29405057430267334, "learning_rate": 6.4942355974392135e-06, "loss": 0.1157, "num_input_tokens_seen": 33673600, "step": 33460 }, { "epoch": 15.777934936350778, "grad_norm": 0.8423271775245667, "learning_rate": 6.48732133692069e-06, "loss": 0.0778, "num_input_tokens_seen": 33677760, "step": 33465 }, { "epoch": 15.78029231494578, "grad_norm": 1.8287947177886963, "learning_rate": 6.480410210284332e-06, "loss": 0.1128, "num_input_tokens_seen": 33683008, "step": 33470 }, { "epoch": 15.782649693540783, "grad_norm": 1.1931320428848267, "learning_rate": 6.473502218700081e-06, "loss": 0.0973, "num_input_tokens_seen": 33687936, "step": 33475 }, { "epoch": 15.785007072135786, "grad_norm": 0.5973535180091858, "learning_rate": 6.46659736333734e-06, "loss": 0.0742, "num_input_tokens_seen": 33692352, "step": 33480 }, { "epoch": 15.787364450730788, "grad_norm": 0.03197470307350159, "learning_rate": 6.459695645364994e-06, "loss": 0.1094, "num_input_tokens_seen": 33697056, "step": 33485 }, { "epoch": 15.78972182932579, "grad_norm": 0.08585090190172195, "learning_rate": 6.452797065951374e-06, "loss": 0.1441, "num_input_tokens_seen": 33701536, "step": 33490 }, { "epoch": 15.792079207920793, "grad_norm": 0.4656238555908203, "learning_rate": 6.445901626264295e-06, "loss": 0.0577, "num_input_tokens_seen": 33707104, "step": 33495 }, { "epoch": 15.794436586515795, "grad_norm": 0.5605508089065552, "learning_rate": 6.439009327471038e-06, "loss": 0.0826, "num_input_tokens_seen": 33712352, "step": 33500 }, { "epoch": 15.796793965110798, "grad_norm": 1.3635313510894775, "learning_rate": 6.432120170738357e-06, "loss": 0.1062, "num_input_tokens_seen": 33718176, "step": 33505 }, { "epoch": 15.799151343705798, "grad_norm": 0.0455726720392704, "learning_rate": 6.425234157232473e-06, "loss": 0.0879, "num_input_tokens_seen": 33724064, "step": 33510 }, { "epoch": 15.801508722300802, "grad_norm": 1.2288942337036133, "learning_rate": 6.418351288119057e-06, "loss": 0.095, "num_input_tokens_seen": 33728640, "step": 33515 }, { "epoch": 15.803866100895803, "grad_norm": 0.06132400408387184, "learning_rate": 6.41147156456327e-06, "loss": 0.0553, "num_input_tokens_seen": 33733472, "step": 33520 }, { "epoch": 15.806223479490805, "grad_norm": 0.5570834875106812, "learning_rate": 6.404594987729731e-06, "loss": 0.1827, "num_input_tokens_seen": 33738240, "step": 33525 }, { "epoch": 15.808580858085808, "grad_norm": 0.24309243261814117, "learning_rate": 6.39772155878253e-06, "loss": 0.132, "num_input_tokens_seen": 33743392, "step": 33530 }, { "epoch": 15.81093823668081, "grad_norm": 1.4489707946777344, "learning_rate": 6.390851278885221e-06, "loss": 0.1207, "num_input_tokens_seen": 33747776, "step": 33535 }, { "epoch": 15.813295615275813, "grad_norm": 0.49195826053619385, "learning_rate": 6.383984149200822e-06, "loss": 0.118, "num_input_tokens_seen": 33752576, "step": 33540 }, { "epoch": 15.815652993870815, "grad_norm": 1.5505211353302002, "learning_rate": 6.377120170891829e-06, "loss": 0.129, "num_input_tokens_seen": 33757472, "step": 33545 }, { "epoch": 15.818010372465817, "grad_norm": 1.483494758605957, "learning_rate": 6.3702593451201966e-06, "loss": 0.1905, "num_input_tokens_seen": 33762272, "step": 33550 }, { "epoch": 15.82036775106082, "grad_norm": 0.2172517329454422, "learning_rate": 6.363401673047334e-06, "loss": 0.0259, "num_input_tokens_seen": 33766592, "step": 33555 }, { "epoch": 15.822725129655822, "grad_norm": 0.9491157531738281, "learning_rate": 6.356547155834139e-06, "loss": 0.0709, "num_input_tokens_seen": 33771808, "step": 33560 }, { "epoch": 15.825082508250825, "grad_norm": 1.3990448713302612, "learning_rate": 6.349695794640961e-06, "loss": 0.091, "num_input_tokens_seen": 33777408, "step": 33565 }, { "epoch": 15.827439886845827, "grad_norm": 0.16192319989204407, "learning_rate": 6.342847590627623e-06, "loss": 0.0392, "num_input_tokens_seen": 33781600, "step": 33570 }, { "epoch": 15.82979726544083, "grad_norm": 0.309558629989624, "learning_rate": 6.336002544953407e-06, "loss": 0.1344, "num_input_tokens_seen": 33786112, "step": 33575 }, { "epoch": 15.832154644035832, "grad_norm": 1.5827394723892212, "learning_rate": 6.329160658777061e-06, "loss": 0.2009, "num_input_tokens_seen": 33791232, "step": 33580 }, { "epoch": 15.834512022630834, "grad_norm": 0.438007652759552, "learning_rate": 6.322321933256805e-06, "loss": 0.1324, "num_input_tokens_seen": 33796544, "step": 33585 }, { "epoch": 15.836869401225837, "grad_norm": 1.1287184953689575, "learning_rate": 6.315486369550322e-06, "loss": 0.0793, "num_input_tokens_seen": 33801920, "step": 33590 }, { "epoch": 15.839226779820839, "grad_norm": 0.0644063875079155, "learning_rate": 6.308653968814746e-06, "loss": 0.1327, "num_input_tokens_seen": 33806624, "step": 33595 }, { "epoch": 15.841584158415841, "grad_norm": 0.5859617590904236, "learning_rate": 6.301824732206693e-06, "loss": 0.0291, "num_input_tokens_seen": 33811744, "step": 33600 }, { "epoch": 15.843941537010844, "grad_norm": 0.0721796303987503, "learning_rate": 6.2949986608822345e-06, "loss": 0.1667, "num_input_tokens_seen": 33816832, "step": 33605 }, { "epoch": 15.846298915605846, "grad_norm": 1.2005517482757568, "learning_rate": 6.288175755996911e-06, "loss": 0.0863, "num_input_tokens_seen": 33822464, "step": 33610 }, { "epoch": 15.848656294200849, "grad_norm": 0.07045619189739227, "learning_rate": 6.2813560187057225e-06, "loss": 0.1802, "num_input_tokens_seen": 33826560, "step": 33615 }, { "epoch": 15.851013672795851, "grad_norm": 0.05162432789802551, "learning_rate": 6.274539450163133e-06, "loss": 0.0661, "num_input_tokens_seen": 33831072, "step": 33620 }, { "epoch": 15.853371051390853, "grad_norm": 0.06212529540061951, "learning_rate": 6.267726051523079e-06, "loss": 0.0063, "num_input_tokens_seen": 33836032, "step": 33625 }, { "epoch": 15.855728429985856, "grad_norm": 0.11463986337184906, "learning_rate": 6.260915823938954e-06, "loss": 0.0389, "num_input_tokens_seen": 33841824, "step": 33630 }, { "epoch": 15.858085808580858, "grad_norm": 1.375197172164917, "learning_rate": 6.254108768563599e-06, "loss": 0.1512, "num_input_tokens_seen": 33846016, "step": 33635 }, { "epoch": 15.86044318717586, "grad_norm": 2.444526433944702, "learning_rate": 6.247304886549346e-06, "loss": 0.1231, "num_input_tokens_seen": 33851456, "step": 33640 }, { "epoch": 15.862800565770863, "grad_norm": 0.062109652906656265, "learning_rate": 6.240504179047971e-06, "loss": 0.1587, "num_input_tokens_seen": 33857024, "step": 33645 }, { "epoch": 15.865157944365865, "grad_norm": 1.1957758665084839, "learning_rate": 6.233706647210722e-06, "loss": 0.0797, "num_input_tokens_seen": 33861952, "step": 33650 }, { "epoch": 15.867515322960868, "grad_norm": 1.0348924398422241, "learning_rate": 6.2269122921883e-06, "loss": 0.1247, "num_input_tokens_seen": 33866432, "step": 33655 }, { "epoch": 15.86987270155587, "grad_norm": 1.5641347169876099, "learning_rate": 6.2201211151308806e-06, "loss": 0.1628, "num_input_tokens_seen": 33871104, "step": 33660 }, { "epoch": 15.872230080150873, "grad_norm": 1.1448261737823486, "learning_rate": 6.213333117188095e-06, "loss": 0.2646, "num_input_tokens_seen": 33875840, "step": 33665 }, { "epoch": 15.874587458745875, "grad_norm": 0.10477610677480698, "learning_rate": 6.206548299509038e-06, "loss": 0.1691, "num_input_tokens_seen": 33880928, "step": 33670 }, { "epoch": 15.876944837340877, "grad_norm": 0.34121060371398926, "learning_rate": 6.199766663242252e-06, "loss": 0.1904, "num_input_tokens_seen": 33887168, "step": 33675 }, { "epoch": 15.87930221593588, "grad_norm": 0.5946184992790222, "learning_rate": 6.192988209535757e-06, "loss": 0.0893, "num_input_tokens_seen": 33891744, "step": 33680 }, { "epoch": 15.881659594530882, "grad_norm": 1.588183045387268, "learning_rate": 6.186212939537039e-06, "loss": 0.1326, "num_input_tokens_seen": 33896960, "step": 33685 }, { "epoch": 15.884016973125885, "grad_norm": 1.6950818300247192, "learning_rate": 6.179440854393026e-06, "loss": 0.1224, "num_input_tokens_seen": 33901152, "step": 33690 }, { "epoch": 15.886374351720887, "grad_norm": 0.6115970611572266, "learning_rate": 6.172671955250126e-06, "loss": 0.2039, "num_input_tokens_seen": 33905440, "step": 33695 }, { "epoch": 15.88873173031589, "grad_norm": 0.03128721937537193, "learning_rate": 6.165906243254191e-06, "loss": 0.0856, "num_input_tokens_seen": 33910464, "step": 33700 }, { "epoch": 15.891089108910892, "grad_norm": 1.708855152130127, "learning_rate": 6.159143719550542e-06, "loss": 0.2945, "num_input_tokens_seen": 33914944, "step": 33705 }, { "epoch": 15.893446487505894, "grad_norm": 0.1358027309179306, "learning_rate": 6.152384385283974e-06, "loss": 0.1919, "num_input_tokens_seen": 33919616, "step": 33710 }, { "epoch": 15.895803866100895, "grad_norm": 0.07568255811929703, "learning_rate": 6.145628241598705e-06, "loss": 0.1218, "num_input_tokens_seen": 33923808, "step": 33715 }, { "epoch": 15.898161244695899, "grad_norm": 0.132386714220047, "learning_rate": 6.138875289638455e-06, "loss": 0.0703, "num_input_tokens_seen": 33928192, "step": 33720 }, { "epoch": 15.9005186232909, "grad_norm": 1.4145958423614502, "learning_rate": 6.132125530546365e-06, "loss": 0.0799, "num_input_tokens_seen": 33933632, "step": 33725 }, { "epoch": 15.902876001885902, "grad_norm": 0.343506395816803, "learning_rate": 6.125378965465065e-06, "loss": 0.08, "num_input_tokens_seen": 33938880, "step": 33730 }, { "epoch": 15.905233380480905, "grad_norm": 0.3085334897041321, "learning_rate": 6.118635595536634e-06, "loss": 0.1393, "num_input_tokens_seen": 33944896, "step": 33735 }, { "epoch": 15.907590759075907, "grad_norm": 1.4695616960525513, "learning_rate": 6.111895421902608e-06, "loss": 0.0737, "num_input_tokens_seen": 33949760, "step": 33740 }, { "epoch": 15.90994813767091, "grad_norm": 0.22264093160629272, "learning_rate": 6.105158445703987e-06, "loss": 0.03, "num_input_tokens_seen": 33955712, "step": 33745 }, { "epoch": 15.912305516265912, "grad_norm": 1.4891557693481445, "learning_rate": 6.098424668081226e-06, "loss": 0.1801, "num_input_tokens_seen": 33960064, "step": 33750 }, { "epoch": 15.914662894860914, "grad_norm": 1.7065824270248413, "learning_rate": 6.0916940901742355e-06, "loss": 0.2589, "num_input_tokens_seen": 33964960, "step": 33755 }, { "epoch": 15.917020273455917, "grad_norm": 0.6075195670127869, "learning_rate": 6.0849667131224005e-06, "loss": 0.0431, "num_input_tokens_seen": 33970176, "step": 33760 }, { "epoch": 15.919377652050919, "grad_norm": 0.27218642830848694, "learning_rate": 6.0782425380645325e-06, "loss": 0.128, "num_input_tokens_seen": 33974592, "step": 33765 }, { "epoch": 15.921735030645921, "grad_norm": 0.09925652295351028, "learning_rate": 6.071521566138929e-06, "loss": 0.242, "num_input_tokens_seen": 33978560, "step": 33770 }, { "epoch": 15.924092409240924, "grad_norm": 0.20295387506484985, "learning_rate": 6.06480379848334e-06, "loss": 0.0316, "num_input_tokens_seen": 33983584, "step": 33775 }, { "epoch": 15.926449787835926, "grad_norm": 0.08925720304250717, "learning_rate": 6.058089236234965e-06, "loss": 0.0811, "num_input_tokens_seen": 33990624, "step": 33780 }, { "epoch": 15.928807166430929, "grad_norm": 0.49384528398513794, "learning_rate": 6.0513778805304675e-06, "loss": 0.286, "num_input_tokens_seen": 33995296, "step": 33785 }, { "epoch": 15.931164545025931, "grad_norm": 2.684417724609375, "learning_rate": 6.044669732505967e-06, "loss": 0.2555, "num_input_tokens_seen": 33999840, "step": 33790 }, { "epoch": 15.933521923620933, "grad_norm": 0.1471414715051651, "learning_rate": 6.037964793297035e-06, "loss": 0.0226, "num_input_tokens_seen": 34004768, "step": 33795 }, { "epoch": 15.935879302215936, "grad_norm": 1.3554807901382446, "learning_rate": 6.0312630640387145e-06, "loss": 0.0946, "num_input_tokens_seen": 34009120, "step": 33800 }, { "epoch": 15.938236680810938, "grad_norm": 2.0788702964782715, "learning_rate": 6.02456454586548e-06, "loss": 0.1176, "num_input_tokens_seen": 34014752, "step": 33805 }, { "epoch": 15.94059405940594, "grad_norm": 0.5355360507965088, "learning_rate": 6.017869239911281e-06, "loss": 0.0407, "num_input_tokens_seen": 34019616, "step": 33810 }, { "epoch": 15.942951438000943, "grad_norm": 0.7067227363586426, "learning_rate": 6.011177147309524e-06, "loss": 0.1018, "num_input_tokens_seen": 34023680, "step": 33815 }, { "epoch": 15.945308816595945, "grad_norm": 0.15938130021095276, "learning_rate": 6.004488269193062e-06, "loss": 0.1678, "num_input_tokens_seen": 34029760, "step": 33820 }, { "epoch": 15.947666195190948, "grad_norm": 0.12210110574960709, "learning_rate": 5.997802606694214e-06, "loss": 0.1801, "num_input_tokens_seen": 34034208, "step": 33825 }, { "epoch": 15.95002357378595, "grad_norm": 0.8401036262512207, "learning_rate": 5.991120160944744e-06, "loss": 0.1322, "num_input_tokens_seen": 34038880, "step": 33830 }, { "epoch": 15.952380952380953, "grad_norm": 0.3506389558315277, "learning_rate": 5.984440933075877e-06, "loss": 0.121, "num_input_tokens_seen": 34043712, "step": 33835 }, { "epoch": 15.954738330975955, "grad_norm": 0.1217275932431221, "learning_rate": 5.9777649242183016e-06, "loss": 0.171, "num_input_tokens_seen": 34048160, "step": 33840 }, { "epoch": 15.957095709570957, "grad_norm": 0.19372934103012085, "learning_rate": 5.97109213550214e-06, "loss": 0.0625, "num_input_tokens_seen": 34053216, "step": 33845 }, { "epoch": 15.95945308816596, "grad_norm": 2.90490460395813, "learning_rate": 5.964422568056985e-06, "loss": 0.1164, "num_input_tokens_seen": 34057664, "step": 33850 }, { "epoch": 15.961810466760962, "grad_norm": 0.17748752236366272, "learning_rate": 5.957756223011885e-06, "loss": 0.241, "num_input_tokens_seen": 34061504, "step": 33855 }, { "epoch": 15.964167845355965, "grad_norm": 0.2196532040834427, "learning_rate": 5.951093101495336e-06, "loss": 0.1148, "num_input_tokens_seen": 34065952, "step": 33860 }, { "epoch": 15.966525223950967, "grad_norm": 1.0288450717926025, "learning_rate": 5.944433204635297e-06, "loss": 0.0913, "num_input_tokens_seen": 34070336, "step": 33865 }, { "epoch": 15.96888260254597, "grad_norm": 0.058891359716653824, "learning_rate": 5.937776533559167e-06, "loss": 0.0334, "num_input_tokens_seen": 34074656, "step": 33870 }, { "epoch": 15.971239981140972, "grad_norm": 0.15966090559959412, "learning_rate": 5.9311230893938145e-06, "loss": 0.0902, "num_input_tokens_seen": 34079456, "step": 33875 }, { "epoch": 15.973597359735974, "grad_norm": 0.8038545846939087, "learning_rate": 5.9244728732655575e-06, "loss": 0.1919, "num_input_tokens_seen": 34083680, "step": 33880 }, { "epoch": 15.975954738330977, "grad_norm": 0.16756831109523773, "learning_rate": 5.917825886300152e-06, "loss": 0.0905, "num_input_tokens_seen": 34088448, "step": 33885 }, { "epoch": 15.978312116925979, "grad_norm": 1.5603718757629395, "learning_rate": 5.911182129622828e-06, "loss": 0.1022, "num_input_tokens_seen": 34092352, "step": 33890 }, { "epoch": 15.980669495520981, "grad_norm": 1.073612928390503, "learning_rate": 5.904541604358257e-06, "loss": 0.039, "num_input_tokens_seen": 34097376, "step": 33895 }, { "epoch": 15.983026874115984, "grad_norm": 0.13015754520893097, "learning_rate": 5.897904311630573e-06, "loss": 0.1427, "num_input_tokens_seen": 34102880, "step": 33900 }, { "epoch": 15.985384252710986, "grad_norm": 0.07224909961223602, "learning_rate": 5.891270252563352e-06, "loss": 0.0833, "num_input_tokens_seen": 34107488, "step": 33905 }, { "epoch": 15.987741631305987, "grad_norm": 0.019809814170002937, "learning_rate": 5.884639428279632e-06, "loss": 0.0768, "num_input_tokens_seen": 34111520, "step": 33910 }, { "epoch": 15.990099009900991, "grad_norm": 0.8884164690971375, "learning_rate": 5.878011839901895e-06, "loss": 0.0295, "num_input_tokens_seen": 34116640, "step": 33915 }, { "epoch": 15.992456388495992, "grad_norm": 1.6839830875396729, "learning_rate": 5.871387488552088e-06, "loss": 0.0983, "num_input_tokens_seen": 34122432, "step": 33920 }, { "epoch": 15.994813767090994, "grad_norm": 2.249452829360962, "learning_rate": 5.8647663753515854e-06, "loss": 0.1305, "num_input_tokens_seen": 34127360, "step": 33925 }, { "epoch": 15.997171145685996, "grad_norm": 0.95669025182724, "learning_rate": 5.85814850142124e-06, "loss": 0.0974, "num_input_tokens_seen": 34133376, "step": 33930 }, { "epoch": 15.999528524280999, "grad_norm": 0.8712288737297058, "learning_rate": 5.851533867881348e-06, "loss": 0.0928, "num_input_tokens_seen": 34137120, "step": 33935 }, { "epoch": 16.0, "eval_loss": 0.1536126285791397, "eval_runtime": 15.153, "eval_samples_per_second": 62.232, "eval_steps_per_second": 15.574, "num_input_tokens_seen": 34138272, "step": 33936 }, { "epoch": 16.001885902876, "grad_norm": 0.058302637189626694, "learning_rate": 5.844922475851644e-06, "loss": 0.0461, "num_input_tokens_seen": 34142944, "step": 33940 }, { "epoch": 16.004243281471005, "grad_norm": 1.044576644897461, "learning_rate": 5.83831432645133e-06, "loss": 0.0548, "num_input_tokens_seen": 34147552, "step": 33945 }, { "epoch": 16.006600660066006, "grad_norm": 0.17635118961334229, "learning_rate": 5.831709420799053e-06, "loss": 0.118, "num_input_tokens_seen": 34151296, "step": 33950 }, { "epoch": 16.00895803866101, "grad_norm": 0.08399388939142227, "learning_rate": 5.825107760012913e-06, "loss": 0.0763, "num_input_tokens_seen": 34155424, "step": 33955 }, { "epoch": 16.01131541725601, "grad_norm": 0.015708163380622864, "learning_rate": 5.818509345210457e-06, "loss": 0.0197, "num_input_tokens_seen": 34160736, "step": 33960 }, { "epoch": 16.013672795851015, "grad_norm": 0.01599152386188507, "learning_rate": 5.811914177508684e-06, "loss": 0.0148, "num_input_tokens_seen": 34165248, "step": 33965 }, { "epoch": 16.016030174446016, "grad_norm": 0.04316045716404915, "learning_rate": 5.805322258024057e-06, "loss": 0.2167, "num_input_tokens_seen": 34170880, "step": 33970 }, { "epoch": 16.01838755304102, "grad_norm": 2.4062793254852295, "learning_rate": 5.798733587872454e-06, "loss": 0.0911, "num_input_tokens_seen": 34175296, "step": 33975 }, { "epoch": 16.02074493163602, "grad_norm": 0.4509084224700928, "learning_rate": 5.7921481681692366e-06, "loss": 0.0604, "num_input_tokens_seen": 34180768, "step": 33980 }, { "epoch": 16.023102310231025, "grad_norm": 0.7243186235427856, "learning_rate": 5.785566000029205e-06, "loss": 0.0747, "num_input_tokens_seen": 34185888, "step": 33985 }, { "epoch": 16.025459688826025, "grad_norm": 0.9691286683082581, "learning_rate": 5.778987084566606e-06, "loss": 0.047, "num_input_tokens_seen": 34190752, "step": 33990 }, { "epoch": 16.02781706742103, "grad_norm": 0.8931372761726379, "learning_rate": 5.772411422895138e-06, "loss": 0.1609, "num_input_tokens_seen": 34196320, "step": 33995 }, { "epoch": 16.03017444601603, "grad_norm": 0.44581860303878784, "learning_rate": 5.765839016127953e-06, "loss": 0.196, "num_input_tokens_seen": 34200832, "step": 34000 }, { "epoch": 16.032531824611034, "grad_norm": 1.170556902885437, "learning_rate": 5.759269865377642e-06, "loss": 0.1611, "num_input_tokens_seen": 34206816, "step": 34005 }, { "epoch": 16.034889203206035, "grad_norm": 0.7461780309677124, "learning_rate": 5.7527039717562625e-06, "loss": 0.1998, "num_input_tokens_seen": 34211744, "step": 34010 }, { "epoch": 16.03724658180104, "grad_norm": 1.8023725748062134, "learning_rate": 5.746141336375294e-06, "loss": 0.1931, "num_input_tokens_seen": 34217024, "step": 34015 }, { "epoch": 16.03960396039604, "grad_norm": 0.03834667056798935, "learning_rate": 5.739581960345683e-06, "loss": 0.0818, "num_input_tokens_seen": 34222208, "step": 34020 }, { "epoch": 16.04196133899104, "grad_norm": 1.0392229557037354, "learning_rate": 5.733025844777825e-06, "loss": 0.198, "num_input_tokens_seen": 34228416, "step": 34025 }, { "epoch": 16.044318717586044, "grad_norm": 0.4872105121612549, "learning_rate": 5.726472990781556e-06, "loss": 0.0411, "num_input_tokens_seen": 34232544, "step": 34030 }, { "epoch": 16.046676096181045, "grad_norm": 0.032427459955215454, "learning_rate": 5.719923399466165e-06, "loss": 0.0268, "num_input_tokens_seen": 34236992, "step": 34035 }, { "epoch": 16.04903347477605, "grad_norm": 0.0672335997223854, "learning_rate": 5.713377071940384e-06, "loss": 0.0341, "num_input_tokens_seen": 34243168, "step": 34040 }, { "epoch": 16.05139085337105, "grad_norm": 0.38603025674819946, "learning_rate": 5.706834009312398e-06, "loss": 0.1859, "num_input_tokens_seen": 34248416, "step": 34045 }, { "epoch": 16.053748231966054, "grad_norm": 0.47934871912002563, "learning_rate": 5.700294212689838e-06, "loss": 0.025, "num_input_tokens_seen": 34253280, "step": 34050 }, { "epoch": 16.056105610561055, "grad_norm": 0.1602805256843567, "learning_rate": 5.693757683179774e-06, "loss": 0.0671, "num_input_tokens_seen": 34257952, "step": 34055 }, { "epoch": 16.05846298915606, "grad_norm": 1.5127544403076172, "learning_rate": 5.6872244218887314e-06, "loss": 0.1571, "num_input_tokens_seen": 34262944, "step": 34060 }, { "epoch": 16.06082036775106, "grad_norm": 1.058732271194458, "learning_rate": 5.680694429922684e-06, "loss": 0.0463, "num_input_tokens_seen": 34268416, "step": 34065 }, { "epoch": 16.063177746346064, "grad_norm": 0.02204454317688942, "learning_rate": 5.674167708387046e-06, "loss": 0.2878, "num_input_tokens_seen": 34273376, "step": 34070 }, { "epoch": 16.065535124941064, "grad_norm": 0.4952166974544525, "learning_rate": 5.667644258386678e-06, "loss": 0.0366, "num_input_tokens_seen": 34278688, "step": 34075 }, { "epoch": 16.06789250353607, "grad_norm": 0.8282880783081055, "learning_rate": 5.661124081025895e-06, "loss": 0.0462, "num_input_tokens_seen": 34284352, "step": 34080 }, { "epoch": 16.07024988213107, "grad_norm": 0.6805248856544495, "learning_rate": 5.65460717740845e-06, "loss": 0.0654, "num_input_tokens_seen": 34291264, "step": 34085 }, { "epoch": 16.072607260726073, "grad_norm": 1.2568638324737549, "learning_rate": 5.648093548637551e-06, "loss": 0.1325, "num_input_tokens_seen": 34296192, "step": 34090 }, { "epoch": 16.074964639321074, "grad_norm": 0.20937320590019226, "learning_rate": 5.641583195815828e-06, "loss": 0.0643, "num_input_tokens_seen": 34300672, "step": 34095 }, { "epoch": 16.077322017916078, "grad_norm": 0.04874250665307045, "learning_rate": 5.635076120045385e-06, "loss": 0.152, "num_input_tokens_seen": 34305472, "step": 34100 }, { "epoch": 16.07967939651108, "grad_norm": 1.1337172985076904, "learning_rate": 5.628572322427755e-06, "loss": 0.1945, "num_input_tokens_seen": 34311008, "step": 34105 }, { "epoch": 16.082036775106083, "grad_norm": 0.13695013523101807, "learning_rate": 5.6220718040639244e-06, "loss": 0.1173, "num_input_tokens_seen": 34314944, "step": 34110 }, { "epoch": 16.084394153701083, "grad_norm": 0.46263569593429565, "learning_rate": 5.615574566054319e-06, "loss": 0.066, "num_input_tokens_seen": 34320672, "step": 34115 }, { "epoch": 16.086751532296088, "grad_norm": 1.5965397357940674, "learning_rate": 5.60908060949881e-06, "loss": 0.2741, "num_input_tokens_seen": 34325440, "step": 34120 }, { "epoch": 16.08910891089109, "grad_norm": 0.06813092529773712, "learning_rate": 5.602589935496716e-06, "loss": 0.0305, "num_input_tokens_seen": 34330400, "step": 34125 }, { "epoch": 16.091466289486092, "grad_norm": 1.1566933393478394, "learning_rate": 5.5961025451468026e-06, "loss": 0.1286, "num_input_tokens_seen": 34335744, "step": 34130 }, { "epoch": 16.093823668081093, "grad_norm": 1.0856138467788696, "learning_rate": 5.5896184395472615e-06, "loss": 0.1021, "num_input_tokens_seen": 34341056, "step": 34135 }, { "epoch": 16.096181046676097, "grad_norm": 0.4853454828262329, "learning_rate": 5.58313761979575e-06, "loss": 0.1258, "num_input_tokens_seen": 34346080, "step": 34140 }, { "epoch": 16.098538425271098, "grad_norm": 0.7569161057472229, "learning_rate": 5.576660086989358e-06, "loss": 0.1158, "num_input_tokens_seen": 34350624, "step": 34145 }, { "epoch": 16.100895803866102, "grad_norm": 0.9913995265960693, "learning_rate": 5.570185842224626e-06, "loss": 0.0929, "num_input_tokens_seen": 34357472, "step": 34150 }, { "epoch": 16.103253182461103, "grad_norm": 0.05202333629131317, "learning_rate": 5.563714886597529e-06, "loss": 0.0446, "num_input_tokens_seen": 34362336, "step": 34155 }, { "epoch": 16.105610561056107, "grad_norm": 0.18320924043655396, "learning_rate": 5.557247221203501e-06, "loss": 0.0584, "num_input_tokens_seen": 34367104, "step": 34160 }, { "epoch": 16.107967939651108, "grad_norm": 0.513200581073761, "learning_rate": 5.5507828471373926e-06, "loss": 0.1392, "num_input_tokens_seen": 34371616, "step": 34165 }, { "epoch": 16.11032531824611, "grad_norm": 0.8269887566566467, "learning_rate": 5.544321765493521e-06, "loss": 0.222, "num_input_tokens_seen": 34376928, "step": 34170 }, { "epoch": 16.112682696841112, "grad_norm": 1.3992338180541992, "learning_rate": 5.5378639773656345e-06, "loss": 0.1552, "num_input_tokens_seen": 34381440, "step": 34175 }, { "epoch": 16.115040075436116, "grad_norm": 0.40420758724212646, "learning_rate": 5.531409483846933e-06, "loss": 0.2087, "num_input_tokens_seen": 34387072, "step": 34180 }, { "epoch": 16.117397454031117, "grad_norm": 0.23419733345508575, "learning_rate": 5.5249582860300435e-06, "loss": 0.089, "num_input_tokens_seen": 34392128, "step": 34185 }, { "epoch": 16.11975483262612, "grad_norm": 0.3414984941482544, "learning_rate": 5.518510385007048e-06, "loss": 0.1109, "num_input_tokens_seen": 34396096, "step": 34190 }, { "epoch": 16.122112211221122, "grad_norm": 0.45139947533607483, "learning_rate": 5.512065781869466e-06, "loss": 0.0616, "num_input_tokens_seen": 34400480, "step": 34195 }, { "epoch": 16.124469589816126, "grad_norm": 0.25269919633865356, "learning_rate": 5.505624477708263e-06, "loss": 0.0854, "num_input_tokens_seen": 34406304, "step": 34200 }, { "epoch": 16.126826968411127, "grad_norm": 0.8264654874801636, "learning_rate": 5.499186473613841e-06, "loss": 0.1266, "num_input_tokens_seen": 34412704, "step": 34205 }, { "epoch": 16.12918434700613, "grad_norm": 0.8434129953384399, "learning_rate": 5.492751770676041e-06, "loss": 0.0733, "num_input_tokens_seen": 34417248, "step": 34210 }, { "epoch": 16.13154172560113, "grad_norm": 0.03718583285808563, "learning_rate": 5.486320369984155e-06, "loss": 0.0853, "num_input_tokens_seen": 34421376, "step": 34215 }, { "epoch": 16.133899104196132, "grad_norm": 0.08958835154771805, "learning_rate": 5.479892272626913e-06, "loss": 0.1777, "num_input_tokens_seen": 34425024, "step": 34220 }, { "epoch": 16.136256482791136, "grad_norm": 0.3504835367202759, "learning_rate": 5.473467479692468e-06, "loss": 0.2043, "num_input_tokens_seen": 34432704, "step": 34225 }, { "epoch": 16.138613861386137, "grad_norm": 1.9597058296203613, "learning_rate": 5.467045992268438e-06, "loss": 0.0981, "num_input_tokens_seen": 34437856, "step": 34230 }, { "epoch": 16.14097123998114, "grad_norm": 1.1881555318832397, "learning_rate": 5.460627811441871e-06, "loss": 0.1507, "num_input_tokens_seen": 34442432, "step": 34235 }, { "epoch": 16.14332861857614, "grad_norm": 1.5023759603500366, "learning_rate": 5.454212938299255e-06, "loss": 0.2018, "num_input_tokens_seen": 34447488, "step": 34240 }, { "epoch": 16.145685997171146, "grad_norm": 2.318351984024048, "learning_rate": 5.447801373926522e-06, "loss": 0.1944, "num_input_tokens_seen": 34451904, "step": 34245 }, { "epoch": 16.148043375766147, "grad_norm": 0.9270391464233398, "learning_rate": 5.441393119409038e-06, "loss": 0.1476, "num_input_tokens_seen": 34457152, "step": 34250 }, { "epoch": 16.15040075436115, "grad_norm": 0.045386072248220444, "learning_rate": 5.434988175831612e-06, "loss": 0.173, "num_input_tokens_seen": 34461376, "step": 34255 }, { "epoch": 16.15275813295615, "grad_norm": 0.02015824057161808, "learning_rate": 5.428586544278502e-06, "loss": 0.0381, "num_input_tokens_seen": 34465984, "step": 34260 }, { "epoch": 16.155115511551156, "grad_norm": 0.3883119225502014, "learning_rate": 5.422188225833377e-06, "loss": 0.1428, "num_input_tokens_seen": 34471200, "step": 34265 }, { "epoch": 16.157472890146156, "grad_norm": 0.6426939964294434, "learning_rate": 5.415793221579374e-06, "loss": 0.1241, "num_input_tokens_seen": 34475808, "step": 34270 }, { "epoch": 16.15983026874116, "grad_norm": 0.2549719512462616, "learning_rate": 5.409401532599057e-06, "loss": 0.0583, "num_input_tokens_seen": 34480064, "step": 34275 }, { "epoch": 16.16218764733616, "grad_norm": 1.0179696083068848, "learning_rate": 5.403013159974432e-06, "loss": 0.0868, "num_input_tokens_seen": 34485280, "step": 34280 }, { "epoch": 16.164545025931165, "grad_norm": 2.4787206649780273, "learning_rate": 5.396628104786941e-06, "loss": 0.1164, "num_input_tokens_seen": 34491008, "step": 34285 }, { "epoch": 16.166902404526166, "grad_norm": 0.02104436606168747, "learning_rate": 5.390246368117466e-06, "loss": 0.1008, "num_input_tokens_seen": 34495936, "step": 34290 }, { "epoch": 16.16925978312117, "grad_norm": 1.3571441173553467, "learning_rate": 5.383867951046326e-06, "loss": 0.1495, "num_input_tokens_seen": 34501440, "step": 34295 }, { "epoch": 16.17161716171617, "grad_norm": 0.037755705416202545, "learning_rate": 5.377492854653285e-06, "loss": 0.0371, "num_input_tokens_seen": 34506112, "step": 34300 }, { "epoch": 16.173974540311175, "grad_norm": 0.015192114748060703, "learning_rate": 5.371121080017525e-06, "loss": 0.1251, "num_input_tokens_seen": 34510624, "step": 34305 }, { "epoch": 16.176331918906175, "grad_norm": 1.4697152376174927, "learning_rate": 5.364752628217687e-06, "loss": 0.3307, "num_input_tokens_seen": 34514688, "step": 34310 }, { "epoch": 16.17868929750118, "grad_norm": 0.023007752373814583, "learning_rate": 5.358387500331843e-06, "loss": 0.036, "num_input_tokens_seen": 34519872, "step": 34315 }, { "epoch": 16.18104667609618, "grad_norm": 0.11363378167152405, "learning_rate": 5.3520256974374986e-06, "loss": 0.0216, "num_input_tokens_seen": 34524800, "step": 34320 }, { "epoch": 16.183404054691184, "grad_norm": 0.12814904749393463, "learning_rate": 5.3456672206116e-06, "loss": 0.0243, "num_input_tokens_seen": 34530304, "step": 34325 }, { "epoch": 16.185761433286185, "grad_norm": 0.024919582530856133, "learning_rate": 5.33931207093053e-06, "loss": 0.0366, "num_input_tokens_seen": 34534528, "step": 34330 }, { "epoch": 16.18811881188119, "grad_norm": 0.1343991756439209, "learning_rate": 5.332960249470106e-06, "loss": 0.024, "num_input_tokens_seen": 34539520, "step": 34335 }, { "epoch": 16.19047619047619, "grad_norm": 0.8815358281135559, "learning_rate": 5.326611757305591e-06, "loss": 0.1922, "num_input_tokens_seen": 34544352, "step": 34340 }, { "epoch": 16.192833569071194, "grad_norm": 1.4986205101013184, "learning_rate": 5.320266595511666e-06, "loss": 0.1605, "num_input_tokens_seen": 34549184, "step": 34345 }, { "epoch": 16.195190947666195, "grad_norm": 0.126688614487648, "learning_rate": 5.313924765162462e-06, "loss": 0.1146, "num_input_tokens_seen": 34553248, "step": 34350 }, { "epoch": 16.1975483262612, "grad_norm": 0.027773646637797356, "learning_rate": 5.307586267331543e-06, "loss": 0.0109, "num_input_tokens_seen": 34560576, "step": 34355 }, { "epoch": 16.1999057048562, "grad_norm": 0.26711413264274597, "learning_rate": 5.301251103091915e-06, "loss": 0.1177, "num_input_tokens_seen": 34566112, "step": 34360 }, { "epoch": 16.202263083451204, "grad_norm": 1.8019524812698364, "learning_rate": 5.2949192735160055e-06, "loss": 0.143, "num_input_tokens_seen": 34571200, "step": 34365 }, { "epoch": 16.204620462046204, "grad_norm": 0.9066960215568542, "learning_rate": 5.288590779675692e-06, "loss": 0.1977, "num_input_tokens_seen": 34575744, "step": 34370 }, { "epoch": 16.20697784064121, "grad_norm": 0.06837456673383713, "learning_rate": 5.2822656226422765e-06, "loss": 0.0352, "num_input_tokens_seen": 34581216, "step": 34375 }, { "epoch": 16.20933521923621, "grad_norm": 0.18987935781478882, "learning_rate": 5.275943803486513e-06, "loss": 0.1573, "num_input_tokens_seen": 34586368, "step": 34380 }, { "epoch": 16.211692597831213, "grad_norm": 0.42461758852005005, "learning_rate": 5.269625323278565e-06, "loss": 0.0197, "num_input_tokens_seen": 34590752, "step": 34385 }, { "epoch": 16.214049976426214, "grad_norm": 1.3588086366653442, "learning_rate": 5.263310183088043e-06, "loss": 0.3197, "num_input_tokens_seen": 34595712, "step": 34390 }, { "epoch": 16.216407355021218, "grad_norm": 0.7704369425773621, "learning_rate": 5.256998383983997e-06, "loss": 0.0972, "num_input_tokens_seen": 34600512, "step": 34395 }, { "epoch": 16.21876473361622, "grad_norm": 0.32195577025413513, "learning_rate": 5.2506899270349045e-06, "loss": 0.196, "num_input_tokens_seen": 34606112, "step": 34400 }, { "epoch": 16.221122112211223, "grad_norm": 1.2141880989074707, "learning_rate": 5.244384813308687e-06, "loss": 0.1555, "num_input_tokens_seen": 34611840, "step": 34405 }, { "epoch": 16.223479490806223, "grad_norm": 0.03362058103084564, "learning_rate": 5.238083043872686e-06, "loss": 0.0642, "num_input_tokens_seen": 34616864, "step": 34410 }, { "epoch": 16.225836869401228, "grad_norm": 1.5110059976577759, "learning_rate": 5.231784619793689e-06, "loss": 0.0904, "num_input_tokens_seen": 34622720, "step": 34415 }, { "epoch": 16.22819424799623, "grad_norm": 1.7421886920928955, "learning_rate": 5.225489542137909e-06, "loss": 0.1034, "num_input_tokens_seen": 34628704, "step": 34420 }, { "epoch": 16.23055162659123, "grad_norm": 1.6267229318618774, "learning_rate": 5.219197811971008e-06, "loss": 0.1825, "num_input_tokens_seen": 34633504, "step": 34425 }, { "epoch": 16.232909005186233, "grad_norm": 0.037427276372909546, "learning_rate": 5.212909430358049e-06, "loss": 0.168, "num_input_tokens_seen": 34639360, "step": 34430 }, { "epoch": 16.235266383781234, "grad_norm": 1.328182578086853, "learning_rate": 5.206624398363558e-06, "loss": 0.1191, "num_input_tokens_seen": 34644960, "step": 34435 }, { "epoch": 16.237623762376238, "grad_norm": 0.5885509848594666, "learning_rate": 5.200342717051484e-06, "loss": 0.0422, "num_input_tokens_seen": 34649376, "step": 34440 }, { "epoch": 16.23998114097124, "grad_norm": 1.309482216835022, "learning_rate": 5.194064387485209e-06, "loss": 0.0927, "num_input_tokens_seen": 34653984, "step": 34445 }, { "epoch": 16.242338519566243, "grad_norm": 0.0785181075334549, "learning_rate": 5.187789410727548e-06, "loss": 0.2064, "num_input_tokens_seen": 34659552, "step": 34450 }, { "epoch": 16.244695898161243, "grad_norm": 0.17987094819545746, "learning_rate": 5.181517787840745e-06, "loss": 0.0718, "num_input_tokens_seen": 34664544, "step": 34455 }, { "epoch": 16.247053276756247, "grad_norm": 0.24138951301574707, "learning_rate": 5.175249519886485e-06, "loss": 0.0391, "num_input_tokens_seen": 34670240, "step": 34460 }, { "epoch": 16.249410655351248, "grad_norm": 0.7368974089622498, "learning_rate": 5.168984607925884e-06, "loss": 0.0363, "num_input_tokens_seen": 34675072, "step": 34465 }, { "epoch": 16.251768033946252, "grad_norm": 0.5526741743087769, "learning_rate": 5.162723053019467e-06, "loss": 0.124, "num_input_tokens_seen": 34680032, "step": 34470 }, { "epoch": 16.254125412541253, "grad_norm": 0.09730220586061478, "learning_rate": 5.156464856227223e-06, "loss": 0.2329, "num_input_tokens_seen": 34685632, "step": 34475 }, { "epoch": 16.256482791136257, "grad_norm": 2.1298506259918213, "learning_rate": 5.150210018608551e-06, "loss": 0.1563, "num_input_tokens_seen": 34692000, "step": 34480 }, { "epoch": 16.258840169731258, "grad_norm": 2.7336366176605225, "learning_rate": 5.143958541222293e-06, "loss": 0.1583, "num_input_tokens_seen": 34696192, "step": 34485 }, { "epoch": 16.261197548326262, "grad_norm": 0.3374651074409485, "learning_rate": 5.137710425126721e-06, "loss": 0.0291, "num_input_tokens_seen": 34700896, "step": 34490 }, { "epoch": 16.263554926921262, "grad_norm": 1.1738805770874023, "learning_rate": 5.131465671379529e-06, "loss": 0.0549, "num_input_tokens_seen": 34705792, "step": 34495 }, { "epoch": 16.265912305516267, "grad_norm": 2.3341922760009766, "learning_rate": 5.125224281037852e-06, "loss": 0.2909, "num_input_tokens_seen": 34710496, "step": 34500 }, { "epoch": 16.268269684111267, "grad_norm": 0.2662108540534973, "learning_rate": 5.118986255158253e-06, "loss": 0.0877, "num_input_tokens_seen": 34714560, "step": 34505 }, { "epoch": 16.27062706270627, "grad_norm": 2.0579674243927, "learning_rate": 5.112751594796717e-06, "loss": 0.2174, "num_input_tokens_seen": 34719552, "step": 34510 }, { "epoch": 16.272984441301272, "grad_norm": 1.2626734972000122, "learning_rate": 5.1065203010086685e-06, "loss": 0.1772, "num_input_tokens_seen": 34725504, "step": 34515 }, { "epoch": 16.275341819896276, "grad_norm": 0.05395497381687164, "learning_rate": 5.100292374848961e-06, "loss": 0.045, "num_input_tokens_seen": 34730432, "step": 34520 }, { "epoch": 16.277699198491277, "grad_norm": 0.4276283085346222, "learning_rate": 5.0940678173718785e-06, "loss": 0.1181, "num_input_tokens_seen": 34735488, "step": 34525 }, { "epoch": 16.28005657708628, "grad_norm": 0.8710120916366577, "learning_rate": 5.08784662963113e-06, "loss": 0.0788, "num_input_tokens_seen": 34741440, "step": 34530 }, { "epoch": 16.28241395568128, "grad_norm": 0.09338261187076569, "learning_rate": 5.081628812679856e-06, "loss": 0.151, "num_input_tokens_seen": 34747936, "step": 34535 }, { "epoch": 16.284771334276286, "grad_norm": 3.2595531940460205, "learning_rate": 5.075414367570633e-06, "loss": 0.2542, "num_input_tokens_seen": 34753024, "step": 34540 }, { "epoch": 16.287128712871286, "grad_norm": 0.12274543941020966, "learning_rate": 5.069203295355463e-06, "loss": 0.1379, "num_input_tokens_seen": 34757600, "step": 34545 }, { "epoch": 16.28948609146629, "grad_norm": 0.016585176810622215, "learning_rate": 5.062995597085759e-06, "loss": 0.1103, "num_input_tokens_seen": 34762720, "step": 34550 }, { "epoch": 16.29184347006129, "grad_norm": 1.127367377281189, "learning_rate": 5.0567912738123914e-06, "loss": 0.0921, "num_input_tokens_seen": 34767008, "step": 34555 }, { "epoch": 16.294200848656295, "grad_norm": 0.4971105754375458, "learning_rate": 5.050590326585647e-06, "loss": 0.1407, "num_input_tokens_seen": 34772608, "step": 34560 }, { "epoch": 16.296558227251296, "grad_norm": 0.23804908990859985, "learning_rate": 5.044392756455238e-06, "loss": 0.1236, "num_input_tokens_seen": 34777280, "step": 34565 }, { "epoch": 16.2989156058463, "grad_norm": 0.5046254992485046, "learning_rate": 5.038198564470306e-06, "loss": 0.0475, "num_input_tokens_seen": 34783104, "step": 34570 }, { "epoch": 16.3012729844413, "grad_norm": 0.16893704235553741, "learning_rate": 5.032007751679426e-06, "loss": 0.1252, "num_input_tokens_seen": 34788448, "step": 34575 }, { "epoch": 16.303630363036305, "grad_norm": 1.7372533082962036, "learning_rate": 5.025820319130597e-06, "loss": 0.2755, "num_input_tokens_seen": 34793728, "step": 34580 }, { "epoch": 16.305987741631306, "grad_norm": 0.644968569278717, "learning_rate": 5.019636267871253e-06, "loss": 0.1725, "num_input_tokens_seen": 34799008, "step": 34585 }, { "epoch": 16.30834512022631, "grad_norm": 0.27162376046180725, "learning_rate": 5.013455598948233e-06, "loss": 0.1099, "num_input_tokens_seen": 34804544, "step": 34590 }, { "epoch": 16.31070249882131, "grad_norm": 0.42566266655921936, "learning_rate": 5.0072783134078285e-06, "loss": 0.1256, "num_input_tokens_seen": 34809856, "step": 34595 }, { "epoch": 16.313059877416315, "grad_norm": 0.7297952771186829, "learning_rate": 5.0011044122957515e-06, "loss": 0.1048, "num_input_tokens_seen": 34814816, "step": 34600 }, { "epoch": 16.315417256011315, "grad_norm": 0.37265342473983765, "learning_rate": 4.994933896657131e-06, "loss": 0.1666, "num_input_tokens_seen": 34821024, "step": 34605 }, { "epoch": 16.31777463460632, "grad_norm": 0.1806603968143463, "learning_rate": 4.988766767536546e-06, "loss": 0.0892, "num_input_tokens_seen": 34825376, "step": 34610 }, { "epoch": 16.32013201320132, "grad_norm": 0.7923404574394226, "learning_rate": 4.982603025977966e-06, "loss": 0.0779, "num_input_tokens_seen": 34831168, "step": 34615 }, { "epoch": 16.32248939179632, "grad_norm": 0.14785915613174438, "learning_rate": 4.976442673024817e-06, "loss": 0.0403, "num_input_tokens_seen": 34836384, "step": 34620 }, { "epoch": 16.324846770391325, "grad_norm": 0.21801011264324188, "learning_rate": 4.970285709719946e-06, "loss": 0.058, "num_input_tokens_seen": 34842432, "step": 34625 }, { "epoch": 16.327204148986326, "grad_norm": 0.006731722503900528, "learning_rate": 4.964132137105618e-06, "loss": 0.0251, "num_input_tokens_seen": 34846816, "step": 34630 }, { "epoch": 16.32956152758133, "grad_norm": 1.40487539768219, "learning_rate": 4.9579819562235345e-06, "loss": 0.2566, "num_input_tokens_seen": 34851712, "step": 34635 }, { "epoch": 16.33191890617633, "grad_norm": 0.023796766996383667, "learning_rate": 4.951835168114807e-06, "loss": 0.0579, "num_input_tokens_seen": 34856960, "step": 34640 }, { "epoch": 16.334276284771335, "grad_norm": 1.0982412099838257, "learning_rate": 4.945691773819985e-06, "loss": 0.0797, "num_input_tokens_seen": 34862048, "step": 34645 }, { "epoch": 16.336633663366335, "grad_norm": 2.0575664043426514, "learning_rate": 4.9395517743790446e-06, "loss": 0.0867, "num_input_tokens_seen": 34867360, "step": 34650 }, { "epoch": 16.33899104196134, "grad_norm": 2.6773734092712402, "learning_rate": 4.933415170831382e-06, "loss": 0.0754, "num_input_tokens_seen": 34871552, "step": 34655 }, { "epoch": 16.34134842055634, "grad_norm": 1.3824059963226318, "learning_rate": 4.9272819642158185e-06, "loss": 0.1095, "num_input_tokens_seen": 34875520, "step": 34660 }, { "epoch": 16.343705799151344, "grad_norm": 1.3530523777008057, "learning_rate": 4.921152155570604e-06, "loss": 0.0514, "num_input_tokens_seen": 34880608, "step": 34665 }, { "epoch": 16.346063177746345, "grad_norm": 0.5963668823242188, "learning_rate": 4.91502574593341e-06, "loss": 0.1251, "num_input_tokens_seen": 34885728, "step": 34670 }, { "epoch": 16.34842055634135, "grad_norm": 0.2656092941761017, "learning_rate": 4.908902736341342e-06, "loss": 0.1817, "num_input_tokens_seen": 34890656, "step": 34675 }, { "epoch": 16.35077793493635, "grad_norm": 1.3552347421646118, "learning_rate": 4.902783127830904e-06, "loss": 0.1624, "num_input_tokens_seen": 34898112, "step": 34680 }, { "epoch": 16.353135313531354, "grad_norm": 0.06728129833936691, "learning_rate": 4.896666921438053e-06, "loss": 0.0672, "num_input_tokens_seen": 34903328, "step": 34685 }, { "epoch": 16.355492692126354, "grad_norm": 1.4100455045700073, "learning_rate": 4.890554118198157e-06, "loss": 0.2317, "num_input_tokens_seen": 34908928, "step": 34690 }, { "epoch": 16.35785007072136, "grad_norm": 1.0002498626708984, "learning_rate": 4.8844447191460085e-06, "loss": 0.0949, "num_input_tokens_seen": 34914784, "step": 34695 }, { "epoch": 16.36020744931636, "grad_norm": 0.1068594828248024, "learning_rate": 4.878338725315826e-06, "loss": 0.0073, "num_input_tokens_seen": 34919136, "step": 34700 }, { "epoch": 16.362564827911363, "grad_norm": 0.03641023486852646, "learning_rate": 4.872236137741251e-06, "loss": 0.0103, "num_input_tokens_seen": 34925088, "step": 34705 }, { "epoch": 16.364922206506364, "grad_norm": 1.6879266500473022, "learning_rate": 4.86613695745535e-06, "loss": 0.1566, "num_input_tokens_seen": 34930592, "step": 34710 }, { "epoch": 16.367279585101368, "grad_norm": 1.7959325313568115, "learning_rate": 4.860041185490613e-06, "loss": 0.1349, "num_input_tokens_seen": 34935488, "step": 34715 }, { "epoch": 16.36963696369637, "grad_norm": 1.5802574157714844, "learning_rate": 4.853948822878943e-06, "loss": 0.0746, "num_input_tokens_seen": 34941152, "step": 34720 }, { "epoch": 16.371994342291373, "grad_norm": 0.04962825030088425, "learning_rate": 4.847859870651672e-06, "loss": 0.0339, "num_input_tokens_seen": 34946496, "step": 34725 }, { "epoch": 16.374351720886374, "grad_norm": 0.24276603758335114, "learning_rate": 4.841774329839563e-06, "loss": 0.1559, "num_input_tokens_seen": 34951776, "step": 34730 }, { "epoch": 16.376709099481378, "grad_norm": 2.0774149894714355, "learning_rate": 4.8356922014727944e-06, "loss": 0.2066, "num_input_tokens_seen": 34956064, "step": 34735 }, { "epoch": 16.37906647807638, "grad_norm": 0.06269657611846924, "learning_rate": 4.829613486580964e-06, "loss": 0.2244, "num_input_tokens_seen": 34961248, "step": 34740 }, { "epoch": 16.381423856671383, "grad_norm": 1.4856131076812744, "learning_rate": 4.823538186193097e-06, "loss": 0.1272, "num_input_tokens_seen": 34965856, "step": 34745 }, { "epoch": 16.383781235266383, "grad_norm": 0.13422128558158875, "learning_rate": 4.817466301337639e-06, "loss": 0.0369, "num_input_tokens_seen": 34970752, "step": 34750 }, { "epoch": 16.386138613861387, "grad_norm": 1.0752590894699097, "learning_rate": 4.811397833042461e-06, "loss": 0.1556, "num_input_tokens_seen": 34975104, "step": 34755 }, { "epoch": 16.388495992456388, "grad_norm": 0.8112874627113342, "learning_rate": 4.805332782334842e-06, "loss": 0.047, "num_input_tokens_seen": 34980128, "step": 34760 }, { "epoch": 16.390853371051392, "grad_norm": 2.516058921813965, "learning_rate": 4.799271150241496e-06, "loss": 0.2027, "num_input_tokens_seen": 34985408, "step": 34765 }, { "epoch": 16.393210749646393, "grad_norm": 1.2693774700164795, "learning_rate": 4.793212937788558e-06, "loss": 0.2458, "num_input_tokens_seen": 34990944, "step": 34770 }, { "epoch": 16.395568128241397, "grad_norm": 0.12785208225250244, "learning_rate": 4.787158146001575e-06, "loss": 0.0406, "num_input_tokens_seen": 34995584, "step": 34775 }, { "epoch": 16.397925506836398, "grad_norm": 2.093775749206543, "learning_rate": 4.781106775905525e-06, "loss": 0.1439, "num_input_tokens_seen": 35001696, "step": 34780 }, { "epoch": 16.400282885431402, "grad_norm": 1.0039178133010864, "learning_rate": 4.775058828524801e-06, "loss": 0.1372, "num_input_tokens_seen": 35006688, "step": 34785 }, { "epoch": 16.402640264026402, "grad_norm": 0.22187675535678864, "learning_rate": 4.7690143048832154e-06, "loss": 0.0331, "num_input_tokens_seen": 35012128, "step": 34790 }, { "epoch": 16.404997642621407, "grad_norm": 0.3980828523635864, "learning_rate": 4.762973206004012e-06, "loss": 0.0329, "num_input_tokens_seen": 35016128, "step": 34795 }, { "epoch": 16.407355021216407, "grad_norm": 0.8492836356163025, "learning_rate": 4.756935532909834e-06, "loss": 0.1028, "num_input_tokens_seen": 35020032, "step": 34800 }, { "epoch": 16.40971239981141, "grad_norm": 2.5237483978271484, "learning_rate": 4.7509012866227635e-06, "loss": 0.0993, "num_input_tokens_seen": 35024480, "step": 34805 }, { "epoch": 16.412069778406412, "grad_norm": 0.3157539367675781, "learning_rate": 4.744870468164292e-06, "loss": 0.068, "num_input_tokens_seen": 35029664, "step": 34810 }, { "epoch": 16.414427157001413, "grad_norm": 0.235783651471138, "learning_rate": 4.738843078555341e-06, "loss": 0.1335, "num_input_tokens_seen": 35034528, "step": 34815 }, { "epoch": 16.416784535596417, "grad_norm": 0.07048333436250687, "learning_rate": 4.73281911881624e-06, "loss": 0.0709, "num_input_tokens_seen": 35038880, "step": 34820 }, { "epoch": 16.419141914191417, "grad_norm": 2.2180209159851074, "learning_rate": 4.726798589966747e-06, "loss": 0.1326, "num_input_tokens_seen": 35044032, "step": 34825 }, { "epoch": 16.42149929278642, "grad_norm": 0.25124695897102356, "learning_rate": 4.7207814930260305e-06, "loss": 0.1878, "num_input_tokens_seen": 35049120, "step": 34830 }, { "epoch": 16.423856671381422, "grad_norm": 0.5089643001556396, "learning_rate": 4.714767829012695e-06, "loss": 0.1728, "num_input_tokens_seen": 35054848, "step": 34835 }, { "epoch": 16.426214049976426, "grad_norm": 1.1790515184402466, "learning_rate": 4.708757598944735e-06, "loss": 0.0713, "num_input_tokens_seen": 35059904, "step": 34840 }, { "epoch": 16.428571428571427, "grad_norm": 1.4306689500808716, "learning_rate": 4.702750803839592e-06, "loss": 0.1423, "num_input_tokens_seen": 35064992, "step": 34845 }, { "epoch": 16.43092880716643, "grad_norm": 0.013576250523328781, "learning_rate": 4.6967474447141055e-06, "loss": 0.0104, "num_input_tokens_seen": 35069952, "step": 34850 }, { "epoch": 16.433286185761432, "grad_norm": 0.3438913822174072, "learning_rate": 4.6907475225845485e-06, "loss": 0.0682, "num_input_tokens_seen": 35074368, "step": 34855 }, { "epoch": 16.435643564356436, "grad_norm": 0.7471536993980408, "learning_rate": 4.684751038466603e-06, "loss": 0.1843, "num_input_tokens_seen": 35079680, "step": 34860 }, { "epoch": 16.438000942951437, "grad_norm": 1.2047662734985352, "learning_rate": 4.678757993375374e-06, "loss": 0.2514, "num_input_tokens_seen": 35083840, "step": 34865 }, { "epoch": 16.44035832154644, "grad_norm": 0.5606820583343506, "learning_rate": 4.672768388325382e-06, "loss": 0.0659, "num_input_tokens_seen": 35089184, "step": 34870 }, { "epoch": 16.44271570014144, "grad_norm": 1.2633916139602661, "learning_rate": 4.666782224330563e-06, "loss": 0.0749, "num_input_tokens_seen": 35094400, "step": 34875 }, { "epoch": 16.445073078736446, "grad_norm": 0.09194227308034897, "learning_rate": 4.660799502404278e-06, "loss": 0.1288, "num_input_tokens_seen": 35099552, "step": 34880 }, { "epoch": 16.447430457331446, "grad_norm": 1.4864420890808105, "learning_rate": 4.6548202235593e-06, "loss": 0.071, "num_input_tokens_seen": 35104512, "step": 34885 }, { "epoch": 16.44978783592645, "grad_norm": 0.055646128952503204, "learning_rate": 4.6488443888078136e-06, "loss": 0.0689, "num_input_tokens_seen": 35111584, "step": 34890 }, { "epoch": 16.45214521452145, "grad_norm": 0.08879249542951584, "learning_rate": 4.642871999161427e-06, "loss": 0.1795, "num_input_tokens_seen": 35115744, "step": 34895 }, { "epoch": 16.454502593116455, "grad_norm": 0.12996982038021088, "learning_rate": 4.63690305563117e-06, "loss": 0.072, "num_input_tokens_seen": 35121440, "step": 34900 }, { "epoch": 16.456859971711456, "grad_norm": 0.7701950669288635, "learning_rate": 4.630937559227477e-06, "loss": 0.1258, "num_input_tokens_seen": 35126560, "step": 34905 }, { "epoch": 16.45921735030646, "grad_norm": 0.0622214749455452, "learning_rate": 4.6249755109602085e-06, "loss": 0.1158, "num_input_tokens_seen": 35130560, "step": 34910 }, { "epoch": 16.46157472890146, "grad_norm": 0.2690730392932892, "learning_rate": 4.619016911838639e-06, "loss": 0.2744, "num_input_tokens_seen": 35134976, "step": 34915 }, { "epoch": 16.463932107496465, "grad_norm": 0.29333001375198364, "learning_rate": 4.613061762871454e-06, "loss": 0.0937, "num_input_tokens_seen": 35139456, "step": 34920 }, { "epoch": 16.466289486091465, "grad_norm": 1.7633904218673706, "learning_rate": 4.607110065066772e-06, "loss": 0.0927, "num_input_tokens_seen": 35144288, "step": 34925 }, { "epoch": 16.46864686468647, "grad_norm": 0.3143348693847656, "learning_rate": 4.601161819432096e-06, "loss": 0.1498, "num_input_tokens_seen": 35149568, "step": 34930 }, { "epoch": 16.47100424328147, "grad_norm": 1.1827397346496582, "learning_rate": 4.595217026974372e-06, "loss": 0.2124, "num_input_tokens_seen": 35154240, "step": 34935 }, { "epoch": 16.473361621876474, "grad_norm": 1.4417392015457153, "learning_rate": 4.589275688699951e-06, "loss": 0.0555, "num_input_tokens_seen": 35158048, "step": 34940 }, { "epoch": 16.475719000471475, "grad_norm": 1.029930830001831, "learning_rate": 4.583337805614602e-06, "loss": 0.1616, "num_input_tokens_seen": 35163328, "step": 34945 }, { "epoch": 16.47807637906648, "grad_norm": 2.978207588195801, "learning_rate": 4.577403378723507e-06, "loss": 0.1574, "num_input_tokens_seen": 35167840, "step": 34950 }, { "epoch": 16.48043375766148, "grad_norm": 0.19344250857830048, "learning_rate": 4.571472409031266e-06, "loss": 0.0554, "num_input_tokens_seen": 35174144, "step": 34955 }, { "epoch": 16.482791136256484, "grad_norm": 0.0075540440157055855, "learning_rate": 4.565544897541887e-06, "loss": 0.0468, "num_input_tokens_seen": 35179424, "step": 34960 }, { "epoch": 16.485148514851485, "grad_norm": 0.11209213733673096, "learning_rate": 4.559620845258808e-06, "loss": 0.0551, "num_input_tokens_seen": 35183808, "step": 34965 }, { "epoch": 16.48750589344649, "grad_norm": 0.11852212250232697, "learning_rate": 4.553700253184853e-06, "loss": 0.0453, "num_input_tokens_seen": 35188032, "step": 34970 }, { "epoch": 16.48986327204149, "grad_norm": 0.17881599068641663, "learning_rate": 4.547783122322291e-06, "loss": 0.1656, "num_input_tokens_seen": 35192256, "step": 34975 }, { "epoch": 16.492220650636494, "grad_norm": 1.3130933046340942, "learning_rate": 4.5418694536727844e-06, "loss": 0.2775, "num_input_tokens_seen": 35198432, "step": 34980 }, { "epoch": 16.494578029231494, "grad_norm": 0.7306278347969055, "learning_rate": 4.535959248237421e-06, "loss": 0.0873, "num_input_tokens_seen": 35203200, "step": 34985 }, { "epoch": 16.4969354078265, "grad_norm": 2.828643321990967, "learning_rate": 4.5300525070166965e-06, "loss": 0.1858, "num_input_tokens_seen": 35207616, "step": 34990 }, { "epoch": 16.4992927864215, "grad_norm": 1.7851495742797852, "learning_rate": 4.524149231010522e-06, "loss": 0.115, "num_input_tokens_seen": 35212352, "step": 34995 }, { "epoch": 16.501650165016503, "grad_norm": 0.5343672633171082, "learning_rate": 4.5182494212182234e-06, "loss": 0.0643, "num_input_tokens_seen": 35216448, "step": 35000 }, { "epoch": 16.504007543611504, "grad_norm": 0.24634215235710144, "learning_rate": 4.512353078638543e-06, "loss": 0.0463, "num_input_tokens_seen": 35220960, "step": 35005 }, { "epoch": 16.506364922206508, "grad_norm": 0.05600941926240921, "learning_rate": 4.506460204269622e-06, "loss": 0.1451, "num_input_tokens_seen": 35226528, "step": 35010 }, { "epoch": 16.50872230080151, "grad_norm": 1.6487685441970825, "learning_rate": 4.500570799109024e-06, "loss": 0.1979, "num_input_tokens_seen": 35231648, "step": 35015 }, { "epoch": 16.51107967939651, "grad_norm": 0.20218558609485626, "learning_rate": 4.494684864153734e-06, "loss": 0.1189, "num_input_tokens_seen": 35236320, "step": 35020 }, { "epoch": 16.513437057991514, "grad_norm": 0.16215790808200836, "learning_rate": 4.488802400400133e-06, "loss": 0.0241, "num_input_tokens_seen": 35240608, "step": 35025 }, { "epoch": 16.515794436586514, "grad_norm": 0.06966190785169601, "learning_rate": 4.482923408844025e-06, "loss": 0.0682, "num_input_tokens_seen": 35247072, "step": 35030 }, { "epoch": 16.51815181518152, "grad_norm": 0.881271481513977, "learning_rate": 4.477047890480626e-06, "loss": 0.1303, "num_input_tokens_seen": 35251488, "step": 35035 }, { "epoch": 16.52050919377652, "grad_norm": 1.036490559577942, "learning_rate": 4.471175846304557e-06, "loss": 0.0956, "num_input_tokens_seen": 35255968, "step": 35040 }, { "epoch": 16.522866572371523, "grad_norm": 0.13288530707359314, "learning_rate": 4.465307277309866e-06, "loss": 0.1036, "num_input_tokens_seen": 35261376, "step": 35045 }, { "epoch": 16.525223950966524, "grad_norm": 0.8408312201499939, "learning_rate": 4.459442184489985e-06, "loss": 0.0558, "num_input_tokens_seen": 35266272, "step": 35050 }, { "epoch": 16.527581329561528, "grad_norm": 1.4115573167800903, "learning_rate": 4.453580568837784e-06, "loss": 0.312, "num_input_tokens_seen": 35271328, "step": 35055 }, { "epoch": 16.52993870815653, "grad_norm": 0.04287402331829071, "learning_rate": 4.447722431345536e-06, "loss": 0.0448, "num_input_tokens_seen": 35276288, "step": 35060 }, { "epoch": 16.532296086751533, "grad_norm": 0.31518062949180603, "learning_rate": 4.441867773004929e-06, "loss": 0.0691, "num_input_tokens_seen": 35280736, "step": 35065 }, { "epoch": 16.534653465346533, "grad_norm": 0.2038269191980362, "learning_rate": 4.4360165948070405e-06, "loss": 0.0823, "num_input_tokens_seen": 35285696, "step": 35070 }, { "epoch": 16.537010843941538, "grad_norm": 1.181052565574646, "learning_rate": 4.43016889774239e-06, "loss": 0.1149, "num_input_tokens_seen": 35291520, "step": 35075 }, { "epoch": 16.539368222536538, "grad_norm": 0.5626181364059448, "learning_rate": 4.424324682800887e-06, "loss": 0.1779, "num_input_tokens_seen": 35296384, "step": 35080 }, { "epoch": 16.541725601131542, "grad_norm": 1.2765167951583862, "learning_rate": 4.418483950971863e-06, "loss": 0.2035, "num_input_tokens_seen": 35301568, "step": 35085 }, { "epoch": 16.544082979726543, "grad_norm": 0.367676705121994, "learning_rate": 4.41264670324405e-06, "loss": 0.0533, "num_input_tokens_seen": 35306112, "step": 35090 }, { "epoch": 16.546440358321547, "grad_norm": 0.6122258901596069, "learning_rate": 4.406812940605604e-06, "loss": 0.0964, "num_input_tokens_seen": 35311424, "step": 35095 }, { "epoch": 16.548797736916548, "grad_norm": 0.25461211800575256, "learning_rate": 4.400982664044067e-06, "loss": 0.09, "num_input_tokens_seen": 35317184, "step": 35100 }, { "epoch": 16.551155115511552, "grad_norm": 0.6322311758995056, "learning_rate": 4.395155874546414e-06, "loss": 0.2567, "num_input_tokens_seen": 35321760, "step": 35105 }, { "epoch": 16.553512494106553, "grad_norm": 0.5592080950737, "learning_rate": 4.38933257309902e-06, "loss": 0.0729, "num_input_tokens_seen": 35328000, "step": 35110 }, { "epoch": 16.555869872701557, "grad_norm": 2.002451181411743, "learning_rate": 4.383512760687675e-06, "loss": 0.1527, "num_input_tokens_seen": 35332416, "step": 35115 }, { "epoch": 16.558227251296557, "grad_norm": 0.5652064085006714, "learning_rate": 4.377696438297568e-06, "loss": 0.0986, "num_input_tokens_seen": 35339488, "step": 35120 }, { "epoch": 16.56058462989156, "grad_norm": 0.22568970918655396, "learning_rate": 4.37188360691331e-06, "loss": 0.3709, "num_input_tokens_seen": 35344864, "step": 35125 }, { "epoch": 16.562942008486562, "grad_norm": 0.325350821018219, "learning_rate": 4.3660742675189105e-06, "loss": 0.0235, "num_input_tokens_seen": 35349344, "step": 35130 }, { "epoch": 16.565299387081566, "grad_norm": 0.6663213968276978, "learning_rate": 4.360268421097802e-06, "loss": 0.1547, "num_input_tokens_seen": 35353888, "step": 35135 }, { "epoch": 16.567656765676567, "grad_norm": 1.1587110757827759, "learning_rate": 4.3544660686327985e-06, "loss": 0.1906, "num_input_tokens_seen": 35358560, "step": 35140 }, { "epoch": 16.57001414427157, "grad_norm": 0.42824453115463257, "learning_rate": 4.348667211106147e-06, "loss": 0.215, "num_input_tokens_seen": 35363040, "step": 35145 }, { "epoch": 16.572371522866572, "grad_norm": 3.0219335556030273, "learning_rate": 4.3428718494995e-06, "loss": 0.0739, "num_input_tokens_seen": 35368192, "step": 35150 }, { "epoch": 16.574728901461576, "grad_norm": 3.413116216659546, "learning_rate": 4.337079984793909e-06, "loss": 0.2108, "num_input_tokens_seen": 35373376, "step": 35155 }, { "epoch": 16.577086280056577, "grad_norm": 0.04606839269399643, "learning_rate": 4.331291617969843e-06, "loss": 0.0149, "num_input_tokens_seen": 35377632, "step": 35160 }, { "epoch": 16.57944365865158, "grad_norm": 1.1066362857818604, "learning_rate": 4.325506750007169e-06, "loss": 0.1547, "num_input_tokens_seen": 35382752, "step": 35165 }, { "epoch": 16.58180103724658, "grad_norm": 0.6463743448257446, "learning_rate": 4.3197253818851706e-06, "loss": 0.1647, "num_input_tokens_seen": 35388128, "step": 35170 }, { "epoch": 16.584158415841586, "grad_norm": 0.0439281202852726, "learning_rate": 4.313947514582536e-06, "loss": 0.0338, "num_input_tokens_seen": 35393536, "step": 35175 }, { "epoch": 16.586515794436586, "grad_norm": 0.07126808166503906, "learning_rate": 4.308173149077357e-06, "loss": 0.0355, "num_input_tokens_seen": 35397376, "step": 35180 }, { "epoch": 16.58887317303159, "grad_norm": 0.8185539841651917, "learning_rate": 4.302402286347135e-06, "loss": 0.0738, "num_input_tokens_seen": 35402272, "step": 35185 }, { "epoch": 16.59123055162659, "grad_norm": 0.014816485345363617, "learning_rate": 4.2966349273687785e-06, "loss": 0.1167, "num_input_tokens_seen": 35407104, "step": 35190 }, { "epoch": 16.593587930221595, "grad_norm": 0.02615090273320675, "learning_rate": 4.290871073118608e-06, "loss": 0.0363, "num_input_tokens_seen": 35411200, "step": 35195 }, { "epoch": 16.595945308816596, "grad_norm": 0.6046074032783508, "learning_rate": 4.285110724572345e-06, "loss": 0.0461, "num_input_tokens_seen": 35416448, "step": 35200 }, { "epoch": 16.5983026874116, "grad_norm": 0.16763505339622498, "learning_rate": 4.2793538827051145e-06, "loss": 0.1611, "num_input_tokens_seen": 35421472, "step": 35205 }, { "epoch": 16.6006600660066, "grad_norm": 0.7763628363609314, "learning_rate": 4.273600548491458e-06, "loss": 0.0362, "num_input_tokens_seen": 35425504, "step": 35210 }, { "epoch": 16.603017444601605, "grad_norm": 0.3274820148944855, "learning_rate": 4.267850722905317e-06, "loss": 0.039, "num_input_tokens_seen": 35431200, "step": 35215 }, { "epoch": 16.605374823196605, "grad_norm": 0.14470802247524261, "learning_rate": 4.262104406920034e-06, "loss": 0.022, "num_input_tokens_seen": 35435936, "step": 35220 }, { "epoch": 16.607732201791606, "grad_norm": 1.1370147466659546, "learning_rate": 4.256361601508363e-06, "loss": 0.1594, "num_input_tokens_seen": 35441056, "step": 35225 }, { "epoch": 16.61008958038661, "grad_norm": 0.2143060266971588, "learning_rate": 4.250622307642469e-06, "loss": 0.1006, "num_input_tokens_seen": 35447328, "step": 35230 }, { "epoch": 16.61244695898161, "grad_norm": 1.2925293445587158, "learning_rate": 4.244886526293909e-06, "loss": 0.0965, "num_input_tokens_seen": 35452320, "step": 35235 }, { "epoch": 16.614804337576615, "grad_norm": 1.953286051750183, "learning_rate": 4.239154258433661e-06, "loss": 0.0701, "num_input_tokens_seen": 35457472, "step": 35240 }, { "epoch": 16.617161716171616, "grad_norm": 0.029267283156514168, "learning_rate": 4.233425505032099e-06, "loss": 0.0442, "num_input_tokens_seen": 35462016, "step": 35245 }, { "epoch": 16.61951909476662, "grad_norm": 0.11645568162202835, "learning_rate": 4.227700267059004e-06, "loss": 0.1225, "num_input_tokens_seen": 35468544, "step": 35250 }, { "epoch": 16.62187647336162, "grad_norm": 0.10353265702724457, "learning_rate": 4.221978545483563e-06, "loss": 0.0834, "num_input_tokens_seen": 35473568, "step": 35255 }, { "epoch": 16.624233851956625, "grad_norm": 0.16840244829654694, "learning_rate": 4.216260341274359e-06, "loss": 0.2451, "num_input_tokens_seen": 35479424, "step": 35260 }, { "epoch": 16.626591230551625, "grad_norm": 0.9907161593437195, "learning_rate": 4.2105456553993895e-06, "loss": 0.066, "num_input_tokens_seen": 35484352, "step": 35265 }, { "epoch": 16.62894860914663, "grad_norm": 1.2741934061050415, "learning_rate": 4.204834488826059e-06, "loss": 0.1592, "num_input_tokens_seen": 35489024, "step": 35270 }, { "epoch": 16.63130598774163, "grad_norm": 0.17547093331813812, "learning_rate": 4.199126842521164e-06, "loss": 0.1471, "num_input_tokens_seen": 35493504, "step": 35275 }, { "epoch": 16.633663366336634, "grad_norm": 0.314756840467453, "learning_rate": 4.19342271745092e-06, "loss": 0.0349, "num_input_tokens_seen": 35498208, "step": 35280 }, { "epoch": 16.636020744931635, "grad_norm": 2.289655923843384, "learning_rate": 4.18772211458093e-06, "loss": 0.0495, "num_input_tokens_seen": 35502912, "step": 35285 }, { "epoch": 16.63837812352664, "grad_norm": 0.25927120447158813, "learning_rate": 4.182025034876225e-06, "loss": 0.0877, "num_input_tokens_seen": 35507808, "step": 35290 }, { "epoch": 16.64073550212164, "grad_norm": 0.6356338858604431, "learning_rate": 4.176331479301201e-06, "loss": 0.0508, "num_input_tokens_seen": 35512000, "step": 35295 }, { "epoch": 16.643092880716644, "grad_norm": 1.0314236879348755, "learning_rate": 4.170641448819701e-06, "loss": 0.0911, "num_input_tokens_seen": 35516448, "step": 35300 }, { "epoch": 16.645450259311644, "grad_norm": 0.12015064805746078, "learning_rate": 4.164954944394933e-06, "loss": 0.1017, "num_input_tokens_seen": 35522112, "step": 35305 }, { "epoch": 16.64780763790665, "grad_norm": 0.041973140090703964, "learning_rate": 4.159271966989536e-06, "loss": 0.0281, "num_input_tokens_seen": 35527840, "step": 35310 }, { "epoch": 16.65016501650165, "grad_norm": 0.13984297215938568, "learning_rate": 4.153592517565541e-06, "loss": 0.0373, "num_input_tokens_seen": 35532704, "step": 35315 }, { "epoch": 16.652522395096653, "grad_norm": 0.4118684232234955, "learning_rate": 4.147916597084378e-06, "loss": 0.0568, "num_input_tokens_seen": 35538144, "step": 35320 }, { "epoch": 16.654879773691654, "grad_norm": 1.050197720527649, "learning_rate": 4.142244206506887e-06, "loss": 0.1638, "num_input_tokens_seen": 35543392, "step": 35325 }, { "epoch": 16.65723715228666, "grad_norm": 0.053317680954933167, "learning_rate": 4.136575346793309e-06, "loss": 0.0482, "num_input_tokens_seen": 35547872, "step": 35330 }, { "epoch": 16.65959453088166, "grad_norm": 2.3660826683044434, "learning_rate": 4.130910018903284e-06, "loss": 0.2771, "num_input_tokens_seen": 35553120, "step": 35335 }, { "epoch": 16.661951909476663, "grad_norm": 1.0859018564224243, "learning_rate": 4.12524822379586e-06, "loss": 0.1385, "num_input_tokens_seen": 35558336, "step": 35340 }, { "epoch": 16.664309288071664, "grad_norm": 0.05067949369549751, "learning_rate": 4.119589962429474e-06, "loss": 0.1251, "num_input_tokens_seen": 35562752, "step": 35345 }, { "epoch": 16.666666666666668, "grad_norm": 0.8074644804000854, "learning_rate": 4.113935235761979e-06, "loss": 0.1296, "num_input_tokens_seen": 35567040, "step": 35350 }, { "epoch": 16.66902404526167, "grad_norm": 0.43354448676109314, "learning_rate": 4.108284044750621e-06, "loss": 0.0456, "num_input_tokens_seen": 35572256, "step": 35355 }, { "epoch": 16.671381423856673, "grad_norm": 0.23012152314186096, "learning_rate": 4.102636390352054e-06, "loss": 0.0451, "num_input_tokens_seen": 35576160, "step": 35360 }, { "epoch": 16.673738802451673, "grad_norm": 0.26664838194847107, "learning_rate": 4.096992273522329e-06, "loss": 0.1257, "num_input_tokens_seen": 35580320, "step": 35365 }, { "epoch": 16.676096181046677, "grad_norm": 0.15573228895664215, "learning_rate": 4.0913516952169e-06, "loss": 0.0827, "num_input_tokens_seen": 35585024, "step": 35370 }, { "epoch": 16.678453559641678, "grad_norm": 0.03846706077456474, "learning_rate": 4.085714656390618e-06, "loss": 0.0356, "num_input_tokens_seen": 35589536, "step": 35375 }, { "epoch": 16.680810938236682, "grad_norm": 0.8929927945137024, "learning_rate": 4.080081157997751e-06, "loss": 0.2069, "num_input_tokens_seen": 35593536, "step": 35380 }, { "epoch": 16.683168316831683, "grad_norm": 0.14954985678195953, "learning_rate": 4.074451200991933e-06, "loss": 0.1368, "num_input_tokens_seen": 35597664, "step": 35385 }, { "epoch": 16.685525695426687, "grad_norm": 0.47146904468536377, "learning_rate": 4.068824786326234e-06, "loss": 0.0968, "num_input_tokens_seen": 35602976, "step": 35390 }, { "epoch": 16.687883074021688, "grad_norm": 0.3825130760669708, "learning_rate": 4.063201914953107e-06, "loss": 0.0258, "num_input_tokens_seen": 35607264, "step": 35395 }, { "epoch": 16.690240452616692, "grad_norm": 2.131826400756836, "learning_rate": 4.05758258782441e-06, "loss": 0.0594, "num_input_tokens_seen": 35612512, "step": 35400 }, { "epoch": 16.692597831211692, "grad_norm": 0.4277225434780121, "learning_rate": 4.051966805891396e-06, "loss": 0.0651, "num_input_tokens_seen": 35617376, "step": 35405 }, { "epoch": 16.694955209806697, "grad_norm": 2.6035289764404297, "learning_rate": 4.046354570104726e-06, "loss": 0.1487, "num_input_tokens_seen": 35622176, "step": 35410 }, { "epoch": 16.697312588401697, "grad_norm": 1.4776582717895508, "learning_rate": 4.040745881414457e-06, "loss": 0.0853, "num_input_tokens_seen": 35626496, "step": 35415 }, { "epoch": 16.6996699669967, "grad_norm": 0.6071659922599792, "learning_rate": 4.035140740770047e-06, "loss": 0.0292, "num_input_tokens_seen": 35632256, "step": 35420 }, { "epoch": 16.702027345591702, "grad_norm": 0.09072701632976532, "learning_rate": 4.029539149120337e-06, "loss": 0.1202, "num_input_tokens_seen": 35636928, "step": 35425 }, { "epoch": 16.704384724186703, "grad_norm": 0.11138325184583664, "learning_rate": 4.023941107413595e-06, "loss": 0.0608, "num_input_tokens_seen": 35641440, "step": 35430 }, { "epoch": 16.706742102781707, "grad_norm": 0.7184771299362183, "learning_rate": 4.018346616597468e-06, "loss": 0.1729, "num_input_tokens_seen": 35647424, "step": 35435 }, { "epoch": 16.709099481376708, "grad_norm": 0.2836416959762573, "learning_rate": 4.0127556776190125e-06, "loss": 0.0259, "num_input_tokens_seen": 35652640, "step": 35440 }, { "epoch": 16.71145685997171, "grad_norm": 1.6716159582138062, "learning_rate": 4.007168291424676e-06, "loss": 0.2169, "num_input_tokens_seen": 35657632, "step": 35445 }, { "epoch": 16.713814238566712, "grad_norm": 0.22639396786689758, "learning_rate": 4.001584458960311e-06, "loss": 0.0533, "num_input_tokens_seen": 35662336, "step": 35450 }, { "epoch": 16.716171617161717, "grad_norm": 1.4614547491073608, "learning_rate": 3.996004181171164e-06, "loss": 0.1104, "num_input_tokens_seen": 35666144, "step": 35455 }, { "epoch": 16.718528995756717, "grad_norm": 1.3249346017837524, "learning_rate": 3.99042745900188e-06, "loss": 0.1501, "num_input_tokens_seen": 35671008, "step": 35460 }, { "epoch": 16.72088637435172, "grad_norm": 0.12295056134462357, "learning_rate": 3.984854293396515e-06, "loss": 0.0792, "num_input_tokens_seen": 35675424, "step": 35465 }, { "epoch": 16.723243752946722, "grad_norm": 1.0691351890563965, "learning_rate": 3.979284685298493e-06, "loss": 0.1291, "num_input_tokens_seen": 35680448, "step": 35470 }, { "epoch": 16.725601131541726, "grad_norm": 0.12913444638252258, "learning_rate": 3.973718635650661e-06, "loss": 0.1016, "num_input_tokens_seen": 35686688, "step": 35475 }, { "epoch": 16.727958510136727, "grad_norm": 0.11905939877033234, "learning_rate": 3.968156145395258e-06, "loss": 0.0448, "num_input_tokens_seen": 35691648, "step": 35480 }, { "epoch": 16.73031588873173, "grad_norm": 0.16648589074611664, "learning_rate": 3.962597215473921e-06, "loss": 0.211, "num_input_tokens_seen": 35696480, "step": 35485 }, { "epoch": 16.73267326732673, "grad_norm": 0.8234895467758179, "learning_rate": 3.957041846827681e-06, "loss": 0.1458, "num_input_tokens_seen": 35700960, "step": 35490 }, { "epoch": 16.735030645921736, "grad_norm": 0.07527393102645874, "learning_rate": 3.951490040396966e-06, "loss": 0.0798, "num_input_tokens_seen": 35706016, "step": 35495 }, { "epoch": 16.737388024516736, "grad_norm": 0.05360841751098633, "learning_rate": 3.945941797121602e-06, "loss": 0.183, "num_input_tokens_seen": 35710880, "step": 35500 }, { "epoch": 16.73974540311174, "grad_norm": 0.020098907873034477, "learning_rate": 3.940397117940822e-06, "loss": 0.0448, "num_input_tokens_seen": 35715456, "step": 35505 }, { "epoch": 16.74210278170674, "grad_norm": 0.3169959485530853, "learning_rate": 3.934856003793233e-06, "loss": 0.1305, "num_input_tokens_seen": 35720320, "step": 35510 }, { "epoch": 16.744460160301745, "grad_norm": 2.571486234664917, "learning_rate": 3.929318455616862e-06, "loss": 0.2281, "num_input_tokens_seen": 35726240, "step": 35515 }, { "epoch": 16.746817538896746, "grad_norm": 0.28258877992630005, "learning_rate": 3.923784474349113e-06, "loss": 0.0347, "num_input_tokens_seen": 35732864, "step": 35520 }, { "epoch": 16.74917491749175, "grad_norm": 0.10532199591398239, "learning_rate": 3.9182540609267955e-06, "loss": 0.0333, "num_input_tokens_seen": 35738400, "step": 35525 }, { "epoch": 16.75153229608675, "grad_norm": 1.2072490453720093, "learning_rate": 3.912727216286119e-06, "loss": 0.2052, "num_input_tokens_seen": 35743168, "step": 35530 }, { "epoch": 16.753889674681755, "grad_norm": 0.21958228945732117, "learning_rate": 3.907203941362683e-06, "loss": 0.0405, "num_input_tokens_seen": 35748192, "step": 35535 }, { "epoch": 16.756247053276756, "grad_norm": 0.031719647347927094, "learning_rate": 3.901684237091485e-06, "loss": 0.0562, "num_input_tokens_seen": 35753824, "step": 35540 }, { "epoch": 16.75860443187176, "grad_norm": 0.41051843762397766, "learning_rate": 3.896168104406917e-06, "loss": 0.0613, "num_input_tokens_seen": 35758656, "step": 35545 }, { "epoch": 16.76096181046676, "grad_norm": 0.7824527025222778, "learning_rate": 3.8906555442427685e-06, "loss": 0.0338, "num_input_tokens_seen": 35763136, "step": 35550 }, { "epoch": 16.763319189061765, "grad_norm": 0.042012616991996765, "learning_rate": 3.885146557532218e-06, "loss": 0.0409, "num_input_tokens_seen": 35767424, "step": 35555 }, { "epoch": 16.765676567656765, "grad_norm": 0.15236109495162964, "learning_rate": 3.879641145207846e-06, "loss": 0.0915, "num_input_tokens_seen": 35772320, "step": 35560 }, { "epoch": 16.76803394625177, "grad_norm": 0.6186798810958862, "learning_rate": 3.8741393082016215e-06, "loss": 0.0449, "num_input_tokens_seen": 35776576, "step": 35565 }, { "epoch": 16.77039132484677, "grad_norm": 0.33555135130882263, "learning_rate": 3.868641047444918e-06, "loss": 0.032, "num_input_tokens_seen": 35781184, "step": 35570 }, { "epoch": 16.772748703441774, "grad_norm": 0.4102511703968048, "learning_rate": 3.863146363868494e-06, "loss": 0.0805, "num_input_tokens_seen": 35786944, "step": 35575 }, { "epoch": 16.775106082036775, "grad_norm": 0.44855841994285583, "learning_rate": 3.85765525840251e-06, "loss": 0.0174, "num_input_tokens_seen": 35791904, "step": 35580 }, { "epoch": 16.77746346063178, "grad_norm": 1.1412113904953003, "learning_rate": 3.852167731976511e-06, "loss": 0.2006, "num_input_tokens_seen": 35797408, "step": 35585 }, { "epoch": 16.77982083922678, "grad_norm": 1.308578610420227, "learning_rate": 3.8466837855194505e-06, "loss": 0.1325, "num_input_tokens_seen": 35803264, "step": 35590 }, { "epoch": 16.782178217821784, "grad_norm": 1.943511724472046, "learning_rate": 3.841203419959657e-06, "loss": 0.0993, "num_input_tokens_seen": 35809024, "step": 35595 }, { "epoch": 16.784535596416784, "grad_norm": 0.02567986398935318, "learning_rate": 3.83572663622487e-06, "loss": 0.0109, "num_input_tokens_seen": 35814496, "step": 35600 }, { "epoch": 16.78689297501179, "grad_norm": 0.8073160648345947, "learning_rate": 3.830253435242215e-06, "loss": 0.0518, "num_input_tokens_seen": 35818944, "step": 35605 }, { "epoch": 16.78925035360679, "grad_norm": 0.2674635648727417, "learning_rate": 3.82478381793821e-06, "loss": 0.2707, "num_input_tokens_seen": 35823648, "step": 35610 }, { "epoch": 16.79160773220179, "grad_norm": 0.05678345263004303, "learning_rate": 3.8193177852387714e-06, "loss": 0.0623, "num_input_tokens_seen": 35830976, "step": 35615 }, { "epoch": 16.793965110796794, "grad_norm": 0.3363112211227417, "learning_rate": 3.8138553380692026e-06, "loss": 0.1557, "num_input_tokens_seen": 35836320, "step": 35620 }, { "epoch": 16.796322489391795, "grad_norm": 0.048201024532318115, "learning_rate": 3.8083964773542064e-06, "loss": 0.0199, "num_input_tokens_seen": 35841056, "step": 35625 }, { "epoch": 16.7986798679868, "grad_norm": 0.11286874860525131, "learning_rate": 3.802941204017879e-06, "loss": 0.0301, "num_input_tokens_seen": 35846208, "step": 35630 }, { "epoch": 16.8010372465818, "grad_norm": 0.4049970209598541, "learning_rate": 3.7974895189836936e-06, "loss": 0.0571, "num_input_tokens_seen": 35851104, "step": 35635 }, { "epoch": 16.803394625176804, "grad_norm": 0.057308025658130646, "learning_rate": 3.792041423174536e-06, "loss": 0.0758, "num_input_tokens_seen": 35855264, "step": 35640 }, { "epoch": 16.805752003771804, "grad_norm": 1.805928349494934, "learning_rate": 3.786596917512672e-06, "loss": 0.0942, "num_input_tokens_seen": 35860288, "step": 35645 }, { "epoch": 16.80810938236681, "grad_norm": 0.1441596895456314, "learning_rate": 3.7811560029197695e-06, "loss": 0.1153, "num_input_tokens_seen": 35865056, "step": 35650 }, { "epoch": 16.81046676096181, "grad_norm": 0.10322435945272446, "learning_rate": 3.775718680316878e-06, "loss": 0.0668, "num_input_tokens_seen": 35870400, "step": 35655 }, { "epoch": 16.812824139556813, "grad_norm": 0.22154198586940765, "learning_rate": 3.7702849506244475e-06, "loss": 0.1941, "num_input_tokens_seen": 35875840, "step": 35660 }, { "epoch": 16.815181518151814, "grad_norm": 2.10430645942688, "learning_rate": 3.7648548147623143e-06, "loss": 0.1627, "num_input_tokens_seen": 35880704, "step": 35665 }, { "epoch": 16.817538896746818, "grad_norm": 0.006388847716152668, "learning_rate": 3.759428273649715e-06, "loss": 0.0832, "num_input_tokens_seen": 35885312, "step": 35670 }, { "epoch": 16.81989627534182, "grad_norm": 0.1728712022304535, "learning_rate": 3.7540053282052596e-06, "loss": 0.0741, "num_input_tokens_seen": 35890688, "step": 35675 }, { "epoch": 16.822253653936823, "grad_norm": 0.2373122274875641, "learning_rate": 3.7485859793469634e-06, "loss": 0.076, "num_input_tokens_seen": 35894912, "step": 35680 }, { "epoch": 16.824611032531823, "grad_norm": 0.12481541931629181, "learning_rate": 3.743170227992235e-06, "loss": 0.1174, "num_input_tokens_seen": 35900192, "step": 35685 }, { "epoch": 16.826968411126828, "grad_norm": 0.08202973753213882, "learning_rate": 3.737758075057865e-06, "loss": 0.1015, "num_input_tokens_seen": 35904704, "step": 35690 }, { "epoch": 16.82932578972183, "grad_norm": 0.9531524777412415, "learning_rate": 3.7323495214600407e-06, "loss": 0.053, "num_input_tokens_seen": 35910240, "step": 35695 }, { "epoch": 16.831683168316832, "grad_norm": 0.3570767045021057, "learning_rate": 3.7269445681143407e-06, "loss": 0.1371, "num_input_tokens_seen": 35915072, "step": 35700 }, { "epoch": 16.834040546911833, "grad_norm": 1.8883479833602905, "learning_rate": 3.721543215935727e-06, "loss": 0.096, "num_input_tokens_seen": 35919616, "step": 35705 }, { "epoch": 16.836397925506837, "grad_norm": 2.1583101749420166, "learning_rate": 3.7161454658385634e-06, "loss": 0.1614, "num_input_tokens_seen": 35924416, "step": 35710 }, { "epoch": 16.838755304101838, "grad_norm": 1.2368730306625366, "learning_rate": 3.710751318736591e-06, "loss": 0.1958, "num_input_tokens_seen": 35930304, "step": 35715 }, { "epoch": 16.841112682696842, "grad_norm": 0.1779518872499466, "learning_rate": 3.705360775542946e-06, "loss": 0.1101, "num_input_tokens_seen": 35935104, "step": 35720 }, { "epoch": 16.843470061291843, "grad_norm": 0.2902977764606476, "learning_rate": 3.699973837170162e-06, "loss": 0.205, "num_input_tokens_seen": 35940800, "step": 35725 }, { "epoch": 16.845827439886847, "grad_norm": 1.2201416492462158, "learning_rate": 3.6945905045301516e-06, "loss": 0.0875, "num_input_tokens_seen": 35946400, "step": 35730 }, { "epoch": 16.848184818481847, "grad_norm": 1.5331659317016602, "learning_rate": 3.6892107785342246e-06, "loss": 0.0553, "num_input_tokens_seen": 35951456, "step": 35735 }, { "epoch": 16.85054219707685, "grad_norm": 1.3719795942306519, "learning_rate": 3.6838346600930827e-06, "loss": 0.0766, "num_input_tokens_seen": 35955968, "step": 35740 }, { "epoch": 16.852899575671852, "grad_norm": 2.965559244155884, "learning_rate": 3.678462150116799e-06, "loss": 0.1232, "num_input_tokens_seen": 35961344, "step": 35745 }, { "epoch": 16.855256954266856, "grad_norm": 0.025795549154281616, "learning_rate": 3.673093249514853e-06, "loss": 0.0166, "num_input_tokens_seen": 35965824, "step": 35750 }, { "epoch": 16.857614332861857, "grad_norm": 0.6136384606361389, "learning_rate": 3.66772795919611e-06, "loss": 0.2354, "num_input_tokens_seen": 35969920, "step": 35755 }, { "epoch": 16.85997171145686, "grad_norm": 0.27202850580215454, "learning_rate": 3.662366280068827e-06, "loss": 0.1553, "num_input_tokens_seen": 35975008, "step": 35760 }, { "epoch": 16.862329090051862, "grad_norm": 0.024632528424263, "learning_rate": 3.657008213040636e-06, "loss": 0.0437, "num_input_tokens_seen": 35980640, "step": 35765 }, { "epoch": 16.864686468646866, "grad_norm": 0.8033923506736755, "learning_rate": 3.6516537590185697e-06, "loss": 0.2136, "num_input_tokens_seen": 35985824, "step": 35770 }, { "epoch": 16.867043847241867, "grad_norm": 0.08698929101228714, "learning_rate": 3.64630291890905e-06, "loss": 0.0176, "num_input_tokens_seen": 35990784, "step": 35775 }, { "epoch": 16.86940122583687, "grad_norm": 1.5769704580307007, "learning_rate": 3.6409556936178784e-06, "loss": 0.2195, "num_input_tokens_seen": 35995264, "step": 35780 }, { "epoch": 16.87175860443187, "grad_norm": 0.21159736812114716, "learning_rate": 3.635612084050255e-06, "loss": 0.0642, "num_input_tokens_seen": 35999264, "step": 35785 }, { "epoch": 16.874115983026876, "grad_norm": 1.335578203201294, "learning_rate": 3.630272091110756e-06, "loss": 0.0548, "num_input_tokens_seen": 36003968, "step": 35790 }, { "epoch": 16.876473361621876, "grad_norm": 2.9683477878570557, "learning_rate": 3.6249357157033565e-06, "loss": 0.2334, "num_input_tokens_seen": 36008576, "step": 35795 }, { "epoch": 16.87883074021688, "grad_norm": 0.15687644481658936, "learning_rate": 3.619602958731419e-06, "loss": 0.0289, "num_input_tokens_seen": 36014464, "step": 35800 }, { "epoch": 16.88118811881188, "grad_norm": 1.3307311534881592, "learning_rate": 3.6142738210976744e-06, "loss": 0.0998, "num_input_tokens_seen": 36018496, "step": 35805 }, { "epoch": 16.883545497406885, "grad_norm": 0.01766004040837288, "learning_rate": 3.6089483037042624e-06, "loss": 0.0247, "num_input_tokens_seen": 36022624, "step": 35810 }, { "epoch": 16.885902876001886, "grad_norm": 1.3049066066741943, "learning_rate": 3.603626407452704e-06, "loss": 0.1349, "num_input_tokens_seen": 36028992, "step": 35815 }, { "epoch": 16.888260254596887, "grad_norm": 0.26414406299591064, "learning_rate": 3.598308133243908e-06, "loss": 0.0394, "num_input_tokens_seen": 36033472, "step": 35820 }, { "epoch": 16.89061763319189, "grad_norm": 0.05278434976935387, "learning_rate": 3.59299348197816e-06, "loss": 0.12, "num_input_tokens_seen": 36039264, "step": 35825 }, { "epoch": 16.89297501178689, "grad_norm": 0.8960198163986206, "learning_rate": 3.5876824545551497e-06, "loss": 0.1407, "num_input_tokens_seen": 36045696, "step": 35830 }, { "epoch": 16.895332390381895, "grad_norm": 0.12390091270208359, "learning_rate": 3.5823750518739373e-06, "loss": 0.0169, "num_input_tokens_seen": 36050272, "step": 35835 }, { "epoch": 16.897689768976896, "grad_norm": 1.7089635133743286, "learning_rate": 3.5770712748329826e-06, "loss": 0.0853, "num_input_tokens_seen": 36055232, "step": 35840 }, { "epoch": 16.9000471475719, "grad_norm": 0.8527228236198425, "learning_rate": 3.5717711243301172e-06, "loss": 0.135, "num_input_tokens_seen": 36061376, "step": 35845 }, { "epoch": 16.9024045261669, "grad_norm": 1.4946452379226685, "learning_rate": 3.5664746012625677e-06, "loss": 0.2309, "num_input_tokens_seen": 36066816, "step": 35850 }, { "epoch": 16.904761904761905, "grad_norm": 0.6384212970733643, "learning_rate": 3.5611817065269475e-06, "loss": 0.0737, "num_input_tokens_seen": 36071744, "step": 35855 }, { "epoch": 16.907119283356906, "grad_norm": 0.03171129152178764, "learning_rate": 3.5558924410192566e-06, "loss": 0.1456, "num_input_tokens_seen": 36075968, "step": 35860 }, { "epoch": 16.90947666195191, "grad_norm": 0.20722617208957672, "learning_rate": 3.5506068056348736e-06, "loss": 0.0314, "num_input_tokens_seen": 36081056, "step": 35865 }, { "epoch": 16.91183404054691, "grad_norm": 0.05561977997422218, "learning_rate": 3.545324801268568e-06, "loss": 0.0752, "num_input_tokens_seen": 36085536, "step": 35870 }, { "epoch": 16.914191419141915, "grad_norm": 0.10591956973075867, "learning_rate": 3.540046428814495e-06, "loss": 0.116, "num_input_tokens_seen": 36090720, "step": 35875 }, { "epoch": 16.916548797736915, "grad_norm": 0.4629667103290558, "learning_rate": 3.534771689166197e-06, "loss": 0.1252, "num_input_tokens_seen": 36094848, "step": 35880 }, { "epoch": 16.91890617633192, "grad_norm": 1.397885799407959, "learning_rate": 3.5295005832165884e-06, "loss": 0.3096, "num_input_tokens_seen": 36100160, "step": 35885 }, { "epoch": 16.92126355492692, "grad_norm": 1.6620848178863525, "learning_rate": 3.52423311185798e-06, "loss": 0.0434, "num_input_tokens_seen": 36106496, "step": 35890 }, { "epoch": 16.923620933521924, "grad_norm": 1.2117187976837158, "learning_rate": 3.5189692759820705e-06, "loss": 0.2507, "num_input_tokens_seen": 36111552, "step": 35895 }, { "epoch": 16.925978312116925, "grad_norm": 0.7653892040252686, "learning_rate": 3.5137090764799376e-06, "loss": 0.0537, "num_input_tokens_seen": 36116416, "step": 35900 }, { "epoch": 16.92833569071193, "grad_norm": 0.26801636815071106, "learning_rate": 3.5084525142420387e-06, "loss": 0.1372, "num_input_tokens_seen": 36121920, "step": 35905 }, { "epoch": 16.93069306930693, "grad_norm": 0.7881730794906616, "learning_rate": 3.503199590158224e-06, "loss": 0.063, "num_input_tokens_seen": 36126560, "step": 35910 }, { "epoch": 16.933050447901934, "grad_norm": 0.954332709312439, "learning_rate": 3.4979503051177226e-06, "loss": 0.0662, "num_input_tokens_seen": 36131072, "step": 35915 }, { "epoch": 16.935407826496935, "grad_norm": 0.09644432365894318, "learning_rate": 3.4927046600091577e-06, "loss": 0.1085, "num_input_tokens_seen": 36136320, "step": 35920 }, { "epoch": 16.93776520509194, "grad_norm": 1.4667813777923584, "learning_rate": 3.4874626557205136e-06, "loss": 0.1661, "num_input_tokens_seen": 36140992, "step": 35925 }, { "epoch": 16.94012258368694, "grad_norm": 0.13774374127388, "learning_rate": 3.482224293139183e-06, "loss": 0.0383, "num_input_tokens_seen": 36146080, "step": 35930 }, { "epoch": 16.942479962281944, "grad_norm": 0.43189388513565063, "learning_rate": 3.476989573151926e-06, "loss": 0.0248, "num_input_tokens_seen": 36151712, "step": 35935 }, { "epoch": 16.944837340876944, "grad_norm": 0.7741876840591431, "learning_rate": 3.4717584966448946e-06, "loss": 0.2784, "num_input_tokens_seen": 36155968, "step": 35940 }, { "epoch": 16.94719471947195, "grad_norm": 1.1204323768615723, "learning_rate": 3.4665310645036197e-06, "loss": 0.097, "num_input_tokens_seen": 36160608, "step": 35945 }, { "epoch": 16.94955209806695, "grad_norm": 1.716801643371582, "learning_rate": 3.4613072776130213e-06, "loss": 0.143, "num_input_tokens_seen": 36165984, "step": 35950 }, { "epoch": 16.951909476661953, "grad_norm": 0.10408607125282288, "learning_rate": 3.4560871368573915e-06, "loss": 0.0559, "num_input_tokens_seen": 36170848, "step": 35955 }, { "epoch": 16.954266855256954, "grad_norm": 1.8137445449829102, "learning_rate": 3.450870643120421e-06, "loss": 0.0395, "num_input_tokens_seen": 36175360, "step": 35960 }, { "epoch": 16.956624233851958, "grad_norm": 0.9273239374160767, "learning_rate": 3.445657797285162e-06, "loss": 0.0419, "num_input_tokens_seen": 36179584, "step": 35965 }, { "epoch": 16.95898161244696, "grad_norm": 0.00690497038885951, "learning_rate": 3.4404486002340723e-06, "loss": 0.0877, "num_input_tokens_seen": 36184800, "step": 35970 }, { "epoch": 16.961338991041963, "grad_norm": 0.5781744122505188, "learning_rate": 3.435243052848969e-06, "loss": 0.0269, "num_input_tokens_seen": 36189056, "step": 35975 }, { "epoch": 16.963696369636963, "grad_norm": 0.33713239431381226, "learning_rate": 3.4300411560110656e-06, "loss": 0.1136, "num_input_tokens_seen": 36193344, "step": 35980 }, { "epoch": 16.966053748231968, "grad_norm": 0.02843749150633812, "learning_rate": 3.4248429106009616e-06, "loss": 0.0347, "num_input_tokens_seen": 36198240, "step": 35985 }, { "epoch": 16.968411126826968, "grad_norm": 2.311213493347168, "learning_rate": 3.419648317498625e-06, "loss": 0.1005, "num_input_tokens_seen": 36202656, "step": 35990 }, { "epoch": 16.970768505421972, "grad_norm": 0.170911505818367, "learning_rate": 3.4144573775834134e-06, "loss": 0.0511, "num_input_tokens_seen": 36208064, "step": 35995 }, { "epoch": 16.973125884016973, "grad_norm": 2.2905006408691406, "learning_rate": 3.409270091734068e-06, "loss": 0.1634, "num_input_tokens_seen": 36212352, "step": 36000 }, { "epoch": 16.975483262611977, "grad_norm": 0.011061081662774086, "learning_rate": 3.4040864608287067e-06, "loss": 0.1399, "num_input_tokens_seen": 36217184, "step": 36005 }, { "epoch": 16.977840641206978, "grad_norm": 0.09531980007886887, "learning_rate": 3.398906485744835e-06, "loss": 0.1385, "num_input_tokens_seen": 36222400, "step": 36010 }, { "epoch": 16.980198019801982, "grad_norm": 0.17789842188358307, "learning_rate": 3.393730167359321e-06, "loss": 0.0208, "num_input_tokens_seen": 36227264, "step": 36015 }, { "epoch": 16.982555398396983, "grad_norm": 0.09812960773706436, "learning_rate": 3.3885575065484397e-06, "loss": 0.1717, "num_input_tokens_seen": 36232864, "step": 36020 }, { "epoch": 16.984912776991983, "grad_norm": 0.058460816740989685, "learning_rate": 3.383388504187829e-06, "loss": 0.1768, "num_input_tokens_seen": 36237664, "step": 36025 }, { "epoch": 16.987270155586987, "grad_norm": 0.0210768710821867, "learning_rate": 3.378223161152519e-06, "loss": 0.063, "num_input_tokens_seen": 36242752, "step": 36030 }, { "epoch": 16.989627534181988, "grad_norm": 0.05557927116751671, "learning_rate": 3.373061478316908e-06, "loss": 0.1454, "num_input_tokens_seen": 36247872, "step": 36035 }, { "epoch": 16.991984912776992, "grad_norm": 0.4434499740600586, "learning_rate": 3.367903456554783e-06, "loss": 0.0221, "num_input_tokens_seen": 36252800, "step": 36040 }, { "epoch": 16.994342291371993, "grad_norm": 0.20277275145053864, "learning_rate": 3.362749096739312e-06, "loss": 0.0391, "num_input_tokens_seen": 36257664, "step": 36045 }, { "epoch": 16.996699669966997, "grad_norm": 0.15382593870162964, "learning_rate": 3.357598399743045e-06, "loss": 0.1019, "num_input_tokens_seen": 36262112, "step": 36050 }, { "epoch": 16.999057048561998, "grad_norm": 0.030043693259358406, "learning_rate": 3.3524513664378975e-06, "loss": 0.0424, "num_input_tokens_seen": 36267328, "step": 36055 }, { "epoch": 17.0, "eval_loss": 0.1527273803949356, "eval_runtime": 15.1095, "eval_samples_per_second": 62.411, "eval_steps_per_second": 15.619, "num_input_tokens_seen": 36269152, "step": 36057 }, { "epoch": 17.001414427157002, "grad_norm": 0.8388797640800476, "learning_rate": 3.347307997695179e-06, "loss": 0.0381, "num_input_tokens_seen": 36272320, "step": 36060 }, { "epoch": 17.003771805752002, "grad_norm": 0.1600039005279541, "learning_rate": 3.3421682943855727e-06, "loss": 0.0891, "num_input_tokens_seen": 36276640, "step": 36065 }, { "epoch": 17.006129184347007, "grad_norm": 1.4715557098388672, "learning_rate": 3.3370322573791468e-06, "loss": 0.0952, "num_input_tokens_seen": 36281216, "step": 36070 }, { "epoch": 17.008486562942007, "grad_norm": 0.2588866949081421, "learning_rate": 3.331899887545342e-06, "loss": 0.0214, "num_input_tokens_seen": 36285952, "step": 36075 }, { "epoch": 17.01084394153701, "grad_norm": 0.06907765567302704, "learning_rate": 3.326771185752986e-06, "loss": 0.0962, "num_input_tokens_seen": 36290528, "step": 36080 }, { "epoch": 17.013201320132012, "grad_norm": 1.8887827396392822, "learning_rate": 3.3216461528702753e-06, "loss": 0.0669, "num_input_tokens_seen": 36295840, "step": 36085 }, { "epoch": 17.015558698727016, "grad_norm": 2.0197317600250244, "learning_rate": 3.3165247897647996e-06, "loss": 0.126, "num_input_tokens_seen": 36300704, "step": 36090 }, { "epoch": 17.017916077322017, "grad_norm": 0.3765345811843872, "learning_rate": 3.3114070973035067e-06, "loss": 0.1135, "num_input_tokens_seen": 36305376, "step": 36095 }, { "epoch": 17.02027345591702, "grad_norm": 2.6832821369171143, "learning_rate": 3.30629307635274e-06, "loss": 0.1416, "num_input_tokens_seen": 36310528, "step": 36100 }, { "epoch": 17.02263083451202, "grad_norm": 0.1610984355211258, "learning_rate": 3.3011827277782183e-06, "loss": 0.0411, "num_input_tokens_seen": 36316032, "step": 36105 }, { "epoch": 17.024988213107026, "grad_norm": 0.9252163767814636, "learning_rate": 3.296076052445035e-06, "loss": 0.1029, "num_input_tokens_seen": 36322656, "step": 36110 }, { "epoch": 17.027345591702026, "grad_norm": 0.5241466164588928, "learning_rate": 3.2909730512176664e-06, "loss": 0.127, "num_input_tokens_seen": 36327040, "step": 36115 }, { "epoch": 17.02970297029703, "grad_norm": 0.28003305196762085, "learning_rate": 3.285873724959959e-06, "loss": 0.1527, "num_input_tokens_seen": 36332480, "step": 36120 }, { "epoch": 17.03206034889203, "grad_norm": 0.16901895403862, "learning_rate": 3.2807780745351455e-06, "loss": 0.0233, "num_input_tokens_seen": 36337312, "step": 36125 }, { "epoch": 17.034417727487035, "grad_norm": 0.06757380068302155, "learning_rate": 3.2756861008058415e-06, "loss": 0.0806, "num_input_tokens_seen": 36342272, "step": 36130 }, { "epoch": 17.036775106082036, "grad_norm": 0.20916883647441864, "learning_rate": 3.2705978046340153e-06, "loss": 0.0458, "num_input_tokens_seen": 36346880, "step": 36135 }, { "epoch": 17.03913248467704, "grad_norm": 1.2193448543548584, "learning_rate": 3.2655131868810363e-06, "loss": 0.0436, "num_input_tokens_seen": 36354240, "step": 36140 }, { "epoch": 17.04148986327204, "grad_norm": 0.9367650747299194, "learning_rate": 3.2604322484076437e-06, "loss": 0.0236, "num_input_tokens_seen": 36358912, "step": 36145 }, { "epoch": 17.043847241867045, "grad_norm": 0.5111185908317566, "learning_rate": 3.2553549900739555e-06, "loss": 0.1373, "num_input_tokens_seen": 36363456, "step": 36150 }, { "epoch": 17.046204620462046, "grad_norm": 0.29810500144958496, "learning_rate": 3.2502814127394644e-06, "loss": 0.1078, "num_input_tokens_seen": 36367744, "step": 36155 }, { "epoch": 17.04856199905705, "grad_norm": 1.1833117008209229, "learning_rate": 3.245211517263039e-06, "loss": 0.1092, "num_input_tokens_seen": 36373056, "step": 36160 }, { "epoch": 17.05091937765205, "grad_norm": 4.413809776306152, "learning_rate": 3.240145304502931e-06, "loss": 0.129, "num_input_tokens_seen": 36379392, "step": 36165 }, { "epoch": 17.053276756247055, "grad_norm": 0.06951724737882614, "learning_rate": 3.235082775316764e-06, "loss": 0.1901, "num_input_tokens_seen": 36384320, "step": 36170 }, { "epoch": 17.055634134842055, "grad_norm": 0.6731846928596497, "learning_rate": 3.2300239305615348e-06, "loss": 0.1456, "num_input_tokens_seen": 36391168, "step": 36175 }, { "epoch": 17.05799151343706, "grad_norm": 0.24683710932731628, "learning_rate": 3.224968771093617e-06, "loss": 0.0304, "num_input_tokens_seen": 36396768, "step": 36180 }, { "epoch": 17.06034889203206, "grad_norm": 0.32526931166648865, "learning_rate": 3.2199172977687707e-06, "loss": 0.073, "num_input_tokens_seen": 36401952, "step": 36185 }, { "epoch": 17.062706270627064, "grad_norm": 0.18120864033699036, "learning_rate": 3.2148695114421228e-06, "loss": 0.2574, "num_input_tokens_seen": 36406368, "step": 36190 }, { "epoch": 17.065063649222065, "grad_norm": 0.24083639681339264, "learning_rate": 3.209825412968173e-06, "loss": 0.1056, "num_input_tokens_seen": 36410688, "step": 36195 }, { "epoch": 17.06742102781707, "grad_norm": 1.4334461688995361, "learning_rate": 3.2047850032008054e-06, "loss": 0.1517, "num_input_tokens_seen": 36416096, "step": 36200 }, { "epoch": 17.06977840641207, "grad_norm": 0.4055107533931732, "learning_rate": 3.199748282993273e-06, "loss": 0.0287, "num_input_tokens_seen": 36421440, "step": 36205 }, { "epoch": 17.072135785007074, "grad_norm": 0.01896245777606964, "learning_rate": 3.1947152531982106e-06, "loss": 0.0785, "num_input_tokens_seen": 36426880, "step": 36210 }, { "epoch": 17.074493163602074, "grad_norm": 0.8616403937339783, "learning_rate": 3.189685914667623e-06, "loss": 0.1754, "num_input_tokens_seen": 36431936, "step": 36215 }, { "epoch": 17.076850542197075, "grad_norm": 1.0240554809570312, "learning_rate": 3.1846602682528983e-06, "loss": 0.1233, "num_input_tokens_seen": 36436352, "step": 36220 }, { "epoch": 17.07920792079208, "grad_norm": 0.07786203175783157, "learning_rate": 3.179638314804778e-06, "loss": 0.0349, "num_input_tokens_seen": 36441312, "step": 36225 }, { "epoch": 17.08156529938708, "grad_norm": 0.03871079906821251, "learning_rate": 3.174620055173405e-06, "loss": 0.1784, "num_input_tokens_seen": 36446528, "step": 36230 }, { "epoch": 17.083922677982084, "grad_norm": 0.2507602572441101, "learning_rate": 3.1696054902082828e-06, "loss": 0.0465, "num_input_tokens_seen": 36451328, "step": 36235 }, { "epoch": 17.086280056577085, "grad_norm": 0.13188038766384125, "learning_rate": 3.1645946207582932e-06, "loss": 0.2452, "num_input_tokens_seen": 36456320, "step": 36240 }, { "epoch": 17.08863743517209, "grad_norm": 0.3468082845211029, "learning_rate": 3.1595874476716888e-06, "loss": 0.0807, "num_input_tokens_seen": 36463392, "step": 36245 }, { "epoch": 17.09099481376709, "grad_norm": 0.07356608659029007, "learning_rate": 3.154583971796099e-06, "loss": 0.068, "num_input_tokens_seen": 36467456, "step": 36250 }, { "epoch": 17.093352192362094, "grad_norm": 1.4375801086425781, "learning_rate": 3.14958419397853e-06, "loss": 0.0627, "num_input_tokens_seen": 36472800, "step": 36255 }, { "epoch": 17.095709570957094, "grad_norm": 1.547203540802002, "learning_rate": 3.144588115065364e-06, "loss": 0.187, "num_input_tokens_seen": 36477216, "step": 36260 }, { "epoch": 17.0980669495521, "grad_norm": 1.984131097793579, "learning_rate": 3.13959573590234e-06, "loss": 0.1742, "num_input_tokens_seen": 36481920, "step": 36265 }, { "epoch": 17.1004243281471, "grad_norm": 0.406935453414917, "learning_rate": 3.1346070573345904e-06, "loss": 0.1886, "num_input_tokens_seen": 36487616, "step": 36270 }, { "epoch": 17.102781706742103, "grad_norm": 0.8695763349533081, "learning_rate": 3.129622080206612e-06, "loss": 0.0735, "num_input_tokens_seen": 36492672, "step": 36275 }, { "epoch": 17.105139085337104, "grad_norm": 0.8180432319641113, "learning_rate": 3.124640805362278e-06, "loss": 0.0869, "num_input_tokens_seen": 36497472, "step": 36280 }, { "epoch": 17.107496463932108, "grad_norm": 0.6121320724487305, "learning_rate": 3.1196632336448317e-06, "loss": 0.0942, "num_input_tokens_seen": 36502592, "step": 36285 }, { "epoch": 17.10985384252711, "grad_norm": 0.0649067685008049, "learning_rate": 3.1146893658968947e-06, "loss": 0.1563, "num_input_tokens_seen": 36507872, "step": 36290 }, { "epoch": 17.112211221122113, "grad_norm": 0.3604186773300171, "learning_rate": 3.1097192029604566e-06, "loss": 0.049, "num_input_tokens_seen": 36512672, "step": 36295 }, { "epoch": 17.114568599717114, "grad_norm": 1.453381896018982, "learning_rate": 3.1047527456768864e-06, "loss": 0.1087, "num_input_tokens_seen": 36518336, "step": 36300 }, { "epoch": 17.116925978312118, "grad_norm": 0.9794089794158936, "learning_rate": 3.099789994886909e-06, "loss": 0.1578, "num_input_tokens_seen": 36522816, "step": 36305 }, { "epoch": 17.11928335690712, "grad_norm": 1.4626784324645996, "learning_rate": 3.094830951430644e-06, "loss": 0.1327, "num_input_tokens_seen": 36529472, "step": 36310 }, { "epoch": 17.121640735502123, "grad_norm": 0.4096633493900299, "learning_rate": 3.0898756161475696e-06, "loss": 0.1496, "num_input_tokens_seen": 36534496, "step": 36315 }, { "epoch": 17.123998114097123, "grad_norm": 3.1056933403015137, "learning_rate": 3.0849239898765397e-06, "loss": 0.2527, "num_input_tokens_seen": 36540128, "step": 36320 }, { "epoch": 17.126355492692127, "grad_norm": 0.7417102456092834, "learning_rate": 3.0799760734557843e-06, "loss": 0.1446, "num_input_tokens_seen": 36545440, "step": 36325 }, { "epoch": 17.128712871287128, "grad_norm": 1.5902938842773438, "learning_rate": 3.0750318677228967e-06, "loss": 0.2981, "num_input_tokens_seen": 36550272, "step": 36330 }, { "epoch": 17.131070249882132, "grad_norm": 0.08570697158575058, "learning_rate": 3.070091373514847e-06, "loss": 0.0751, "num_input_tokens_seen": 36555168, "step": 36335 }, { "epoch": 17.133427628477133, "grad_norm": 2.363826036453247, "learning_rate": 3.0651545916679876e-06, "loss": 0.1866, "num_input_tokens_seen": 36559328, "step": 36340 }, { "epoch": 17.135785007072137, "grad_norm": 2.6140329837799072, "learning_rate": 3.060221523018017e-06, "loss": 0.0994, "num_input_tokens_seen": 36564288, "step": 36345 }, { "epoch": 17.138142385667138, "grad_norm": 1.38038969039917, "learning_rate": 3.055292168400026e-06, "loss": 0.0524, "num_input_tokens_seen": 36569024, "step": 36350 }, { "epoch": 17.14049976426214, "grad_norm": 0.14208362996578217, "learning_rate": 3.0503665286484712e-06, "loss": 0.0578, "num_input_tokens_seen": 36573536, "step": 36355 }, { "epoch": 17.142857142857142, "grad_norm": 0.035879816859960556, "learning_rate": 3.045444604597181e-06, "loss": 0.0286, "num_input_tokens_seen": 36578304, "step": 36360 }, { "epoch": 17.145214521452147, "grad_norm": 1.0523360967636108, "learning_rate": 3.040526397079349e-06, "loss": 0.0684, "num_input_tokens_seen": 36583072, "step": 36365 }, { "epoch": 17.147571900047147, "grad_norm": 0.35598012804985046, "learning_rate": 3.035611906927549e-06, "loss": 0.1674, "num_input_tokens_seen": 36587808, "step": 36370 }, { "epoch": 17.14992927864215, "grad_norm": 0.04293471574783325, "learning_rate": 3.030701134973721e-06, "loss": 0.0144, "num_input_tokens_seen": 36592896, "step": 36375 }, { "epoch": 17.152286657237152, "grad_norm": 0.05874054506421089, "learning_rate": 3.025794082049177e-06, "loss": 0.0197, "num_input_tokens_seen": 36596896, "step": 36380 }, { "epoch": 17.154644035832156, "grad_norm": 0.054619092494249344, "learning_rate": 3.0208907489845865e-06, "loss": 0.1022, "num_input_tokens_seen": 36602304, "step": 36385 }, { "epoch": 17.157001414427157, "grad_norm": 0.7015122175216675, "learning_rate": 3.0159911366100107e-06, "loss": 0.2282, "num_input_tokens_seen": 36607488, "step": 36390 }, { "epoch": 17.15935879302216, "grad_norm": 1.1726809740066528, "learning_rate": 3.0110952457548693e-06, "loss": 0.1259, "num_input_tokens_seen": 36613120, "step": 36395 }, { "epoch": 17.16171617161716, "grad_norm": 0.0770367756485939, "learning_rate": 3.0062030772479517e-06, "loss": 0.0784, "num_input_tokens_seen": 36617472, "step": 36400 }, { "epoch": 17.164073550212166, "grad_norm": 0.07456476986408234, "learning_rate": 3.0013146319174178e-06, "loss": 0.1435, "num_input_tokens_seen": 36622656, "step": 36405 }, { "epoch": 17.166430928807166, "grad_norm": 0.08698096126317978, "learning_rate": 2.9964299105908035e-06, "loss": 0.0407, "num_input_tokens_seen": 36627264, "step": 36410 }, { "epoch": 17.16878830740217, "grad_norm": 0.468903511762619, "learning_rate": 2.9915489140950077e-06, "loss": 0.0815, "num_input_tokens_seen": 36632768, "step": 36415 }, { "epoch": 17.17114568599717, "grad_norm": 1.585028886795044, "learning_rate": 2.9866716432562947e-06, "loss": 0.1446, "num_input_tokens_seen": 36637952, "step": 36420 }, { "epoch": 17.173503064592172, "grad_norm": 0.6854327321052551, "learning_rate": 2.9817980989003127e-06, "loss": 0.1378, "num_input_tokens_seen": 36642752, "step": 36425 }, { "epoch": 17.175860443187176, "grad_norm": 0.022645821794867516, "learning_rate": 2.9769282818520595e-06, "loss": 0.0209, "num_input_tokens_seen": 36648288, "step": 36430 }, { "epoch": 17.178217821782177, "grad_norm": 2.0516889095306396, "learning_rate": 2.9720621929359176e-06, "loss": 0.2829, "num_input_tokens_seen": 36653056, "step": 36435 }, { "epoch": 17.18057520037718, "grad_norm": 2.0676894187927246, "learning_rate": 2.967199832975634e-06, "loss": 0.0769, "num_input_tokens_seen": 36658336, "step": 36440 }, { "epoch": 17.18293257897218, "grad_norm": 0.22146350145339966, "learning_rate": 2.962341202794322e-06, "loss": 0.1524, "num_input_tokens_seen": 36664096, "step": 36445 }, { "epoch": 17.185289957567186, "grad_norm": 0.9055786728858948, "learning_rate": 2.9574863032144683e-06, "loss": 0.1455, "num_input_tokens_seen": 36668736, "step": 36450 }, { "epoch": 17.187647336162186, "grad_norm": 1.9684628248214722, "learning_rate": 2.9526351350579214e-06, "loss": 0.1085, "num_input_tokens_seen": 36672576, "step": 36455 }, { "epoch": 17.19000471475719, "grad_norm": 2.4699208736419678, "learning_rate": 2.9477876991459043e-06, "loss": 0.0901, "num_input_tokens_seen": 36678176, "step": 36460 }, { "epoch": 17.19236209335219, "grad_norm": 0.43266746401786804, "learning_rate": 2.942943996299008e-06, "loss": 0.0393, "num_input_tokens_seen": 36682944, "step": 36465 }, { "epoch": 17.194719471947195, "grad_norm": 1.5487446784973145, "learning_rate": 2.9381040273371794e-06, "loss": 0.126, "num_input_tokens_seen": 36687360, "step": 36470 }, { "epoch": 17.197076850542196, "grad_norm": 3.2303595542907715, "learning_rate": 2.9332677930797527e-06, "loss": 0.109, "num_input_tokens_seen": 36692352, "step": 36475 }, { "epoch": 17.1994342291372, "grad_norm": 0.04670662805438042, "learning_rate": 2.9284352943454134e-06, "loss": 0.0801, "num_input_tokens_seen": 36697856, "step": 36480 }, { "epoch": 17.2017916077322, "grad_norm": 0.10469982028007507, "learning_rate": 2.9236065319522276e-06, "loss": 0.0458, "num_input_tokens_seen": 36702400, "step": 36485 }, { "epoch": 17.204148986327205, "grad_norm": 0.07469480484724045, "learning_rate": 2.918781506717619e-06, "loss": 0.1732, "num_input_tokens_seen": 36707680, "step": 36490 }, { "epoch": 17.206506364922205, "grad_norm": 0.3197559416294098, "learning_rate": 2.9139602194583835e-06, "loss": 0.1125, "num_input_tokens_seen": 36711840, "step": 36495 }, { "epoch": 17.20886374351721, "grad_norm": 0.05153609812259674, "learning_rate": 2.909142670990683e-06, "loss": 0.1223, "num_input_tokens_seen": 36718880, "step": 36500 }, { "epoch": 17.21122112211221, "grad_norm": 2.498843193054199, "learning_rate": 2.904328862130054e-06, "loss": 0.2773, "num_input_tokens_seen": 36723840, "step": 36505 }, { "epoch": 17.213578500707214, "grad_norm": 0.4765566289424896, "learning_rate": 2.8995187936913778e-06, "loss": 0.086, "num_input_tokens_seen": 36728352, "step": 36510 }, { "epoch": 17.215935879302215, "grad_norm": 0.21346555650234222, "learning_rate": 2.8947124664889274e-06, "loss": 0.1999, "num_input_tokens_seen": 36733312, "step": 36515 }, { "epoch": 17.21829325789722, "grad_norm": 0.5843029022216797, "learning_rate": 2.8899098813363278e-06, "loss": 0.0632, "num_input_tokens_seen": 36737984, "step": 36520 }, { "epoch": 17.22065063649222, "grad_norm": 1.2032558917999268, "learning_rate": 2.8851110390465784e-06, "loss": 0.0906, "num_input_tokens_seen": 36743296, "step": 36525 }, { "epoch": 17.223008015087224, "grad_norm": 1.8088406324386597, "learning_rate": 2.8803159404320406e-06, "loss": 0.1618, "num_input_tokens_seen": 36747392, "step": 36530 }, { "epoch": 17.225365393682225, "grad_norm": 0.3496721386909485, "learning_rate": 2.8755245863044426e-06, "loss": 0.1747, "num_input_tokens_seen": 36754400, "step": 36535 }, { "epoch": 17.22772277227723, "grad_norm": 0.09527716040611267, "learning_rate": 2.8707369774748806e-06, "loss": 0.0313, "num_input_tokens_seen": 36759168, "step": 36540 }, { "epoch": 17.23008015087223, "grad_norm": 0.14701929688453674, "learning_rate": 2.8659531147538192e-06, "loss": 0.215, "num_input_tokens_seen": 36763936, "step": 36545 }, { "epoch": 17.232437529467234, "grad_norm": 0.15118223428726196, "learning_rate": 2.861172998951078e-06, "loss": 0.0612, "num_input_tokens_seen": 36768512, "step": 36550 }, { "epoch": 17.234794908062234, "grad_norm": 0.40191638469696045, "learning_rate": 2.8563966308758487e-06, "loss": 0.0757, "num_input_tokens_seen": 36773248, "step": 36555 }, { "epoch": 17.23715228665724, "grad_norm": 0.11793775111436844, "learning_rate": 2.8516240113366955e-06, "loss": 0.0669, "num_input_tokens_seen": 36777376, "step": 36560 }, { "epoch": 17.23950966525224, "grad_norm": 0.12109632790088654, "learning_rate": 2.8468551411415384e-06, "loss": 0.0861, "num_input_tokens_seen": 36782656, "step": 36565 }, { "epoch": 17.241867043847243, "grad_norm": 0.11977456510066986, "learning_rate": 2.842090021097668e-06, "loss": 0.1122, "num_input_tokens_seen": 36787616, "step": 36570 }, { "epoch": 17.244224422442244, "grad_norm": 1.2696126699447632, "learning_rate": 2.8373286520117336e-06, "loss": 0.0642, "num_input_tokens_seen": 36793408, "step": 36575 }, { "epoch": 17.246581801037248, "grad_norm": 1.7737696170806885, "learning_rate": 2.83257103468976e-06, "loss": 0.0818, "num_input_tokens_seen": 36798208, "step": 36580 }, { "epoch": 17.24893917963225, "grad_norm": 0.3726450502872467, "learning_rate": 2.827817169937136e-06, "loss": 0.1429, "num_input_tokens_seen": 36803808, "step": 36585 }, { "epoch": 17.251296558227253, "grad_norm": 1.9463396072387695, "learning_rate": 2.8230670585585927e-06, "loss": 0.1596, "num_input_tokens_seen": 36809728, "step": 36590 }, { "epoch": 17.253653936822253, "grad_norm": 0.05216916650533676, "learning_rate": 2.818320701358257e-06, "loss": 0.046, "num_input_tokens_seen": 36815232, "step": 36595 }, { "epoch": 17.256011315417258, "grad_norm": 1.6517884731292725, "learning_rate": 2.813578099139602e-06, "loss": 0.1355, "num_input_tokens_seen": 36822336, "step": 36600 }, { "epoch": 17.25836869401226, "grad_norm": 2.361706256866455, "learning_rate": 2.8088392527054723e-06, "loss": 0.2503, "num_input_tokens_seen": 36827328, "step": 36605 }, { "epoch": 17.260726072607262, "grad_norm": 0.012324494309723377, "learning_rate": 2.8041041628580707e-06, "loss": 0.2512, "num_input_tokens_seen": 36831712, "step": 36610 }, { "epoch": 17.263083451202263, "grad_norm": 0.09707699716091156, "learning_rate": 2.7993728303989717e-06, "loss": 0.1717, "num_input_tokens_seen": 36836192, "step": 36615 }, { "epoch": 17.265440829797264, "grad_norm": 2.874293088912964, "learning_rate": 2.7946452561291053e-06, "loss": 0.0808, "num_input_tokens_seen": 36841056, "step": 36620 }, { "epoch": 17.267798208392268, "grad_norm": 0.05849717557430267, "learning_rate": 2.789921440848778e-06, "loss": 0.1838, "num_input_tokens_seen": 36845984, "step": 36625 }, { "epoch": 17.27015558698727, "grad_norm": 1.1360588073730469, "learning_rate": 2.7852013853576388e-06, "loss": 0.1331, "num_input_tokens_seen": 36851392, "step": 36630 }, { "epoch": 17.272512965582273, "grad_norm": 0.12449485808610916, "learning_rate": 2.780485090454721e-06, "loss": 0.0739, "num_input_tokens_seen": 36855776, "step": 36635 }, { "epoch": 17.274870344177273, "grad_norm": 2.1988933086395264, "learning_rate": 2.7757725569384135e-06, "loss": 0.1806, "num_input_tokens_seen": 36861312, "step": 36640 }, { "epoch": 17.277227722772277, "grad_norm": 0.10684385895729065, "learning_rate": 2.771063785606465e-06, "loss": 0.035, "num_input_tokens_seen": 36866432, "step": 36645 }, { "epoch": 17.279585101367278, "grad_norm": 2.107067108154297, "learning_rate": 2.7663587772559896e-06, "loss": 0.3068, "num_input_tokens_seen": 36872160, "step": 36650 }, { "epoch": 17.281942479962282, "grad_norm": 1.675021767616272, "learning_rate": 2.7616575326834678e-06, "loss": 0.224, "num_input_tokens_seen": 36877216, "step": 36655 }, { "epoch": 17.284299858557283, "grad_norm": 0.32743898034095764, "learning_rate": 2.756960052684737e-06, "loss": 0.0628, "num_input_tokens_seen": 36882272, "step": 36660 }, { "epoch": 17.286657237152287, "grad_norm": 0.022293219342827797, "learning_rate": 2.7522663380550097e-06, "loss": 0.0559, "num_input_tokens_seen": 36886752, "step": 36665 }, { "epoch": 17.289014615747288, "grad_norm": 0.11818710714578629, "learning_rate": 2.7475763895888433e-06, "loss": 0.1824, "num_input_tokens_seen": 36892704, "step": 36670 }, { "epoch": 17.291371994342292, "grad_norm": 0.15593679249286652, "learning_rate": 2.7428902080801734e-06, "loss": 0.1245, "num_input_tokens_seen": 36897568, "step": 36675 }, { "epoch": 17.293729372937293, "grad_norm": 0.1081663966178894, "learning_rate": 2.7382077943222838e-06, "loss": 0.0762, "num_input_tokens_seen": 36903200, "step": 36680 }, { "epoch": 17.296086751532297, "grad_norm": 0.20406325161457062, "learning_rate": 2.73352914910783e-06, "loss": 0.1815, "num_input_tokens_seen": 36908928, "step": 36685 }, { "epoch": 17.298444130127297, "grad_norm": 0.31094685196876526, "learning_rate": 2.7288542732288288e-06, "loss": 0.3232, "num_input_tokens_seen": 36914752, "step": 36690 }, { "epoch": 17.3008015087223, "grad_norm": 1.2428733110427856, "learning_rate": 2.7241831674766573e-06, "loss": 0.0921, "num_input_tokens_seen": 36919136, "step": 36695 }, { "epoch": 17.303158887317302, "grad_norm": 0.09728563576936722, "learning_rate": 2.719515832642053e-06, "loss": 0.0379, "num_input_tokens_seen": 36924352, "step": 36700 }, { "epoch": 17.305516265912306, "grad_norm": 0.5286792516708374, "learning_rate": 2.7148522695151167e-06, "loss": 0.1339, "num_input_tokens_seen": 36929856, "step": 36705 }, { "epoch": 17.307873644507307, "grad_norm": 0.30588221549987793, "learning_rate": 2.7101924788853124e-06, "loss": 0.0488, "num_input_tokens_seen": 36935136, "step": 36710 }, { "epoch": 17.31023102310231, "grad_norm": 0.11084195226430893, "learning_rate": 2.7055364615414635e-06, "loss": 0.0666, "num_input_tokens_seen": 36940320, "step": 36715 }, { "epoch": 17.31258840169731, "grad_norm": 1.273049235343933, "learning_rate": 2.7008842182717514e-06, "loss": 0.2825, "num_input_tokens_seen": 36945024, "step": 36720 }, { "epoch": 17.314945780292316, "grad_norm": 1.4030481576919556, "learning_rate": 2.696235749863724e-06, "loss": 0.1333, "num_input_tokens_seen": 36950368, "step": 36725 }, { "epoch": 17.317303158887317, "grad_norm": 0.16570104658603668, "learning_rate": 2.691591057104287e-06, "loss": 0.0733, "num_input_tokens_seen": 36956928, "step": 36730 }, { "epoch": 17.31966053748232, "grad_norm": 0.255313903093338, "learning_rate": 2.6869501407797064e-06, "loss": 0.0601, "num_input_tokens_seen": 36962624, "step": 36735 }, { "epoch": 17.32201791607732, "grad_norm": 0.38450318574905396, "learning_rate": 2.6823130016756153e-06, "loss": 0.1124, "num_input_tokens_seen": 36967424, "step": 36740 }, { "epoch": 17.324375294672326, "grad_norm": 0.09495844691991806, "learning_rate": 2.6776796405769993e-06, "loss": 0.1017, "num_input_tokens_seen": 36972672, "step": 36745 }, { "epoch": 17.326732673267326, "grad_norm": 0.5026260018348694, "learning_rate": 2.6730500582682115e-06, "loss": 0.0656, "num_input_tokens_seen": 36979456, "step": 36750 }, { "epoch": 17.32909005186233, "grad_norm": 0.8917057514190674, "learning_rate": 2.6684242555329613e-06, "loss": 0.0982, "num_input_tokens_seen": 36984096, "step": 36755 }, { "epoch": 17.33144743045733, "grad_norm": 1.9509668350219727, "learning_rate": 2.663802233154311e-06, "loss": 0.0975, "num_input_tokens_seen": 36989696, "step": 36760 }, { "epoch": 17.333804809052335, "grad_norm": 3.292370319366455, "learning_rate": 2.659183991914696e-06, "loss": 0.1336, "num_input_tokens_seen": 36994304, "step": 36765 }, { "epoch": 17.336162187647336, "grad_norm": 0.11482895910739899, "learning_rate": 2.6545695325959046e-06, "loss": 0.1271, "num_input_tokens_seen": 36999360, "step": 36770 }, { "epoch": 17.33851956624234, "grad_norm": 0.009216786362230778, "learning_rate": 2.64995885597909e-06, "loss": 0.0483, "num_input_tokens_seen": 37004064, "step": 36775 }, { "epoch": 17.34087694483734, "grad_norm": 0.05390036478638649, "learning_rate": 2.6453519628447584e-06, "loss": 0.0829, "num_input_tokens_seen": 37008832, "step": 36780 }, { "epoch": 17.343234323432345, "grad_norm": 1.5145394802093506, "learning_rate": 2.640748853972777e-06, "loss": 0.2678, "num_input_tokens_seen": 37013632, "step": 36785 }, { "epoch": 17.345591702027345, "grad_norm": 0.4188655912876129, "learning_rate": 2.636149530142379e-06, "loss": 0.0216, "num_input_tokens_seen": 37018848, "step": 36790 }, { "epoch": 17.34794908062235, "grad_norm": 0.6611292958259583, "learning_rate": 2.631553992132152e-06, "loss": 0.0276, "num_input_tokens_seen": 37024192, "step": 36795 }, { "epoch": 17.35030645921735, "grad_norm": 0.05088347941637039, "learning_rate": 2.626962240720035e-06, "loss": 0.1096, "num_input_tokens_seen": 37030336, "step": 36800 }, { "epoch": 17.352663837812354, "grad_norm": 0.20746396481990814, "learning_rate": 2.622374276683337e-06, "loss": 0.1294, "num_input_tokens_seen": 37034880, "step": 36805 }, { "epoch": 17.355021216407355, "grad_norm": 0.20948578417301178, "learning_rate": 2.6177901007987256e-06, "loss": 0.0225, "num_input_tokens_seen": 37040288, "step": 36810 }, { "epoch": 17.35737859500236, "grad_norm": 0.027313711121678352, "learning_rate": 2.613209713842221e-06, "loss": 0.069, "num_input_tokens_seen": 37045888, "step": 36815 }, { "epoch": 17.35973597359736, "grad_norm": 0.2998294532299042, "learning_rate": 2.6086331165892043e-06, "loss": 0.2394, "num_input_tokens_seen": 37050336, "step": 36820 }, { "epoch": 17.36209335219236, "grad_norm": 0.24713729321956635, "learning_rate": 2.604060309814421e-06, "loss": 0.0836, "num_input_tokens_seen": 37054912, "step": 36825 }, { "epoch": 17.364450730787365, "grad_norm": 1.7808085680007935, "learning_rate": 2.5994912942919615e-06, "loss": 0.1384, "num_input_tokens_seen": 37060352, "step": 36830 }, { "epoch": 17.366808109382365, "grad_norm": 2.3668055534362793, "learning_rate": 2.5949260707952947e-06, "loss": 0.2431, "num_input_tokens_seen": 37065056, "step": 36835 }, { "epoch": 17.36916548797737, "grad_norm": 0.05833780765533447, "learning_rate": 2.59036464009722e-06, "loss": 0.0444, "num_input_tokens_seen": 37070176, "step": 36840 }, { "epoch": 17.37152286657237, "grad_norm": 0.9814685583114624, "learning_rate": 2.58580700296992e-06, "loss": 0.0699, "num_input_tokens_seen": 37074752, "step": 36845 }, { "epoch": 17.373880245167374, "grad_norm": 1.9910318851470947, "learning_rate": 2.581253160184924e-06, "loss": 0.1453, "num_input_tokens_seen": 37079104, "step": 36850 }, { "epoch": 17.376237623762375, "grad_norm": 0.3153560161590576, "learning_rate": 2.576703112513118e-06, "loss": 0.0933, "num_input_tokens_seen": 37085440, "step": 36855 }, { "epoch": 17.37859500235738, "grad_norm": 0.6325940489768982, "learning_rate": 2.5721568607247476e-06, "loss": 0.1136, "num_input_tokens_seen": 37090336, "step": 36860 }, { "epoch": 17.38095238095238, "grad_norm": 0.1869911253452301, "learning_rate": 2.5676144055894247e-06, "loss": 0.1046, "num_input_tokens_seen": 37095360, "step": 36865 }, { "epoch": 17.383309759547384, "grad_norm": 0.1464136689901352, "learning_rate": 2.563075747876098e-06, "loss": 0.0258, "num_input_tokens_seen": 37099264, "step": 36870 }, { "epoch": 17.385667138142384, "grad_norm": 0.36438149213790894, "learning_rate": 2.5585408883530893e-06, "loss": 0.066, "num_input_tokens_seen": 37104064, "step": 36875 }, { "epoch": 17.38802451673739, "grad_norm": 0.6380863189697266, "learning_rate": 2.554009827788073e-06, "loss": 0.1526, "num_input_tokens_seen": 37108320, "step": 36880 }, { "epoch": 17.39038189533239, "grad_norm": 0.1278907060623169, "learning_rate": 2.549482566948086e-06, "loss": 0.022, "num_input_tokens_seen": 37112864, "step": 36885 }, { "epoch": 17.392739273927393, "grad_norm": 0.1514025330543518, "learning_rate": 2.5449591065995095e-06, "loss": 0.159, "num_input_tokens_seen": 37117376, "step": 36890 }, { "epoch": 17.395096652522394, "grad_norm": 2.668001413345337, "learning_rate": 2.540439447508089e-06, "loss": 0.1468, "num_input_tokens_seen": 37122496, "step": 36895 }, { "epoch": 17.397454031117398, "grad_norm": 0.08595853298902512, "learning_rate": 2.5359235904389324e-06, "loss": 0.0268, "num_input_tokens_seen": 37127392, "step": 36900 }, { "epoch": 17.3998114097124, "grad_norm": 0.06735117733478546, "learning_rate": 2.531411536156489e-06, "loss": 0.1523, "num_input_tokens_seen": 37132352, "step": 36905 }, { "epoch": 17.402168788307403, "grad_norm": 1.3406749963760376, "learning_rate": 2.526903285424581e-06, "loss": 0.1931, "num_input_tokens_seen": 37138048, "step": 36910 }, { "epoch": 17.404526166902404, "grad_norm": 0.4399259388446808, "learning_rate": 2.5223988390063734e-06, "loss": 0.0717, "num_input_tokens_seen": 37142624, "step": 36915 }, { "epoch": 17.406883545497408, "grad_norm": 0.4435631036758423, "learning_rate": 2.517898197664395e-06, "loss": 0.1573, "num_input_tokens_seen": 37146976, "step": 36920 }, { "epoch": 17.40924092409241, "grad_norm": 0.7270975112915039, "learning_rate": 2.5134013621605305e-06, "loss": 0.1062, "num_input_tokens_seen": 37151712, "step": 36925 }, { "epoch": 17.411598302687413, "grad_norm": 0.12307216972112656, "learning_rate": 2.508908333256013e-06, "loss": 0.0852, "num_input_tokens_seen": 37156512, "step": 36930 }, { "epoch": 17.413955681282413, "grad_norm": 0.1449577957391739, "learning_rate": 2.504419111711437e-06, "loss": 0.1313, "num_input_tokens_seen": 37160352, "step": 36935 }, { "epoch": 17.416313059877417, "grad_norm": 0.053394317626953125, "learning_rate": 2.4999336982867495e-06, "loss": 0.0442, "num_input_tokens_seen": 37165920, "step": 36940 }, { "epoch": 17.418670438472418, "grad_norm": 0.2754426598548889, "learning_rate": 2.49545209374126e-06, "loss": 0.0791, "num_input_tokens_seen": 37171552, "step": 36945 }, { "epoch": 17.421027817067422, "grad_norm": 0.4212040305137634, "learning_rate": 2.490974298833626e-06, "loss": 0.116, "num_input_tokens_seen": 37175936, "step": 36950 }, { "epoch": 17.423385195662423, "grad_norm": 0.4627947509288788, "learning_rate": 2.48650031432186e-06, "loss": 0.045, "num_input_tokens_seen": 37180960, "step": 36955 }, { "epoch": 17.425742574257427, "grad_norm": 1.3651471138000488, "learning_rate": 2.4820301409633346e-06, "loss": 0.0277, "num_input_tokens_seen": 37187744, "step": 36960 }, { "epoch": 17.428099952852428, "grad_norm": 0.06324585527181625, "learning_rate": 2.4775637795147773e-06, "loss": 0.041, "num_input_tokens_seen": 37193376, "step": 36965 }, { "epoch": 17.430457331447432, "grad_norm": 0.7671775817871094, "learning_rate": 2.4731012307322584e-06, "loss": 0.1953, "num_input_tokens_seen": 37199424, "step": 36970 }, { "epoch": 17.432814710042432, "grad_norm": 0.021665193140506744, "learning_rate": 2.4686424953712152e-06, "loss": 0.0081, "num_input_tokens_seen": 37204896, "step": 36975 }, { "epoch": 17.435172088637437, "grad_norm": 1.1600462198257446, "learning_rate": 2.464187574186436e-06, "loss": 0.0441, "num_input_tokens_seen": 37209568, "step": 36980 }, { "epoch": 17.437529467232437, "grad_norm": 0.4819384217262268, "learning_rate": 2.4597364679320668e-06, "loss": 0.098, "num_input_tokens_seen": 37214592, "step": 36985 }, { "epoch": 17.43988684582744, "grad_norm": 0.11090562492609024, "learning_rate": 2.4552891773616e-06, "loss": 0.1415, "num_input_tokens_seen": 37219104, "step": 36990 }, { "epoch": 17.442244224422442, "grad_norm": 2.8604748249053955, "learning_rate": 2.4508457032278885e-06, "loss": 0.0919, "num_input_tokens_seen": 37223648, "step": 36995 }, { "epoch": 17.444601603017446, "grad_norm": 0.2796039283275604, "learning_rate": 2.4464060462831365e-06, "loss": 0.0986, "num_input_tokens_seen": 37229184, "step": 37000 }, { "epoch": 17.446958981612447, "grad_norm": 1.080681324005127, "learning_rate": 2.441970207278907e-06, "loss": 0.1359, "num_input_tokens_seen": 37233824, "step": 37005 }, { "epoch": 17.44931636020745, "grad_norm": 0.11173827946186066, "learning_rate": 2.437538186966104e-06, "loss": 0.1057, "num_input_tokens_seen": 37239200, "step": 37010 }, { "epoch": 17.45167373880245, "grad_norm": 0.11892445385456085, "learning_rate": 2.4331099860949973e-06, "loss": 0.0483, "num_input_tokens_seen": 37245728, "step": 37015 }, { "epoch": 17.454031117397456, "grad_norm": 0.0332178957760334, "learning_rate": 2.428685605415204e-06, "loss": 0.1223, "num_input_tokens_seen": 37250848, "step": 37020 }, { "epoch": 17.456388495992456, "grad_norm": 2.240708589553833, "learning_rate": 2.4242650456757027e-06, "loss": 0.0964, "num_input_tokens_seen": 37255872, "step": 37025 }, { "epoch": 17.458745874587457, "grad_norm": 0.9071909785270691, "learning_rate": 2.4198483076248114e-06, "loss": 0.1786, "num_input_tokens_seen": 37261856, "step": 37030 }, { "epoch": 17.46110325318246, "grad_norm": 0.09467419236898422, "learning_rate": 2.4154353920102125e-06, "loss": 0.1455, "num_input_tokens_seen": 37267584, "step": 37035 }, { "epoch": 17.463460631777462, "grad_norm": 1.0985627174377441, "learning_rate": 2.411026299578939e-06, "loss": 0.0458, "num_input_tokens_seen": 37274240, "step": 37040 }, { "epoch": 17.465818010372466, "grad_norm": 1.9017434120178223, "learning_rate": 2.4066210310773774e-06, "loss": 0.1788, "num_input_tokens_seen": 37278368, "step": 37045 }, { "epoch": 17.468175388967467, "grad_norm": 1.1868400573730469, "learning_rate": 2.402219587251259e-06, "loss": 0.177, "num_input_tokens_seen": 37283200, "step": 37050 }, { "epoch": 17.47053276756247, "grad_norm": 0.49691659212112427, "learning_rate": 2.397821968845676e-06, "loss": 0.0424, "num_input_tokens_seen": 37287712, "step": 37055 }, { "epoch": 17.47289014615747, "grad_norm": 0.0846133828163147, "learning_rate": 2.393428176605067e-06, "loss": 0.1846, "num_input_tokens_seen": 37292864, "step": 37060 }, { "epoch": 17.475247524752476, "grad_norm": 1.370449423789978, "learning_rate": 2.389038211273234e-06, "loss": 0.1471, "num_input_tokens_seen": 37298400, "step": 37065 }, { "epoch": 17.477604903347476, "grad_norm": 0.2509860396385193, "learning_rate": 2.384652073593316e-06, "loss": 0.1859, "num_input_tokens_seen": 37303072, "step": 37070 }, { "epoch": 17.47996228194248, "grad_norm": 1.6206567287445068, "learning_rate": 2.380269764307819e-06, "loss": 0.102, "num_input_tokens_seen": 37307456, "step": 37075 }, { "epoch": 17.48231966053748, "grad_norm": 0.3123500943183899, "learning_rate": 2.375891284158588e-06, "loss": 0.0942, "num_input_tokens_seen": 37312640, "step": 37080 }, { "epoch": 17.484677039132485, "grad_norm": 0.06133337318897247, "learning_rate": 2.3715166338868305e-06, "loss": 0.0208, "num_input_tokens_seen": 37317920, "step": 37085 }, { "epoch": 17.487034417727486, "grad_norm": 0.11075370758771896, "learning_rate": 2.3671458142330934e-06, "loss": 0.1183, "num_input_tokens_seen": 37323168, "step": 37090 }, { "epoch": 17.48939179632249, "grad_norm": 0.023044606670737267, "learning_rate": 2.3627788259372896e-06, "loss": 0.0735, "num_input_tokens_seen": 37327808, "step": 37095 }, { "epoch": 17.49174917491749, "grad_norm": 0.1288710981607437, "learning_rate": 2.35841566973867e-06, "loss": 0.127, "num_input_tokens_seen": 37332576, "step": 37100 }, { "epoch": 17.494106553512495, "grad_norm": 1.782148838043213, "learning_rate": 2.3540563463758436e-06, "loss": 0.0864, "num_input_tokens_seen": 37336416, "step": 37105 }, { "epoch": 17.496463932107496, "grad_norm": 1.7277027368545532, "learning_rate": 2.349700856586773e-06, "loss": 0.1282, "num_input_tokens_seen": 37340544, "step": 37110 }, { "epoch": 17.4988213107025, "grad_norm": 0.06689425557851791, "learning_rate": 2.3453492011087653e-06, "loss": 0.0668, "num_input_tokens_seen": 37344640, "step": 37115 }, { "epoch": 17.5011786892975, "grad_norm": 1.5778064727783203, "learning_rate": 2.341001380678484e-06, "loss": 0.1042, "num_input_tokens_seen": 37349792, "step": 37120 }, { "epoch": 17.503536067892504, "grad_norm": 0.2878672778606415, "learning_rate": 2.3366573960319438e-06, "loss": 0.2072, "num_input_tokens_seen": 37353824, "step": 37125 }, { "epoch": 17.505893446487505, "grad_norm": 0.23788654804229736, "learning_rate": 2.332317247904503e-06, "loss": 0.086, "num_input_tokens_seen": 37359744, "step": 37130 }, { "epoch": 17.50825082508251, "grad_norm": 0.9384377002716064, "learning_rate": 2.3279809370308796e-06, "loss": 0.2152, "num_input_tokens_seen": 37364480, "step": 37135 }, { "epoch": 17.51060820367751, "grad_norm": 0.13242748379707336, "learning_rate": 2.323648464145131e-06, "loss": 0.119, "num_input_tokens_seen": 37369184, "step": 37140 }, { "epoch": 17.512965582272514, "grad_norm": 0.14782924950122833, "learning_rate": 2.3193198299806763e-06, "loss": 0.1112, "num_input_tokens_seen": 37373440, "step": 37145 }, { "epoch": 17.515322960867515, "grad_norm": 0.02103853039443493, "learning_rate": 2.3149950352702786e-06, "loss": 0.0993, "num_input_tokens_seen": 37378816, "step": 37150 }, { "epoch": 17.51768033946252, "grad_norm": 0.12897217273712158, "learning_rate": 2.3106740807460503e-06, "loss": 0.0393, "num_input_tokens_seen": 37382688, "step": 37155 }, { "epoch": 17.52003771805752, "grad_norm": 0.8908868432044983, "learning_rate": 2.3063569671394584e-06, "loss": 0.132, "num_input_tokens_seen": 37388608, "step": 37160 }, { "epoch": 17.522395096652524, "grad_norm": 0.0531318262219429, "learning_rate": 2.3020436951813133e-06, "loss": 0.0312, "num_input_tokens_seen": 37393120, "step": 37165 }, { "epoch": 17.524752475247524, "grad_norm": 1.0517189502716064, "learning_rate": 2.2977342656017836e-06, "loss": 0.1079, "num_input_tokens_seen": 37398752, "step": 37170 }, { "epoch": 17.52710985384253, "grad_norm": 1.4852224588394165, "learning_rate": 2.2934286791303854e-06, "loss": 0.163, "num_input_tokens_seen": 37403488, "step": 37175 }, { "epoch": 17.52946723243753, "grad_norm": 0.1272098422050476, "learning_rate": 2.2891269364959693e-06, "loss": 0.0564, "num_input_tokens_seen": 37409024, "step": 37180 }, { "epoch": 17.531824611032533, "grad_norm": 0.9212551116943359, "learning_rate": 2.2848290384267557e-06, "loss": 0.086, "num_input_tokens_seen": 37414656, "step": 37185 }, { "epoch": 17.534181989627534, "grad_norm": 0.1329653412103653, "learning_rate": 2.280534985650304e-06, "loss": 0.0967, "num_input_tokens_seen": 37419264, "step": 37190 }, { "epoch": 17.536539368222538, "grad_norm": 1.0864022970199585, "learning_rate": 2.2762447788935244e-06, "loss": 0.0434, "num_input_tokens_seen": 37423776, "step": 37195 }, { "epoch": 17.53889674681754, "grad_norm": 0.7729650139808655, "learning_rate": 2.2719584188826775e-06, "loss": 0.1299, "num_input_tokens_seen": 37428480, "step": 37200 }, { "epoch": 17.541254125412543, "grad_norm": 0.23147538304328918, "learning_rate": 2.2676759063433693e-06, "loss": 0.0322, "num_input_tokens_seen": 37433344, "step": 37205 }, { "epoch": 17.543611504007544, "grad_norm": 0.013576551340520382, "learning_rate": 2.2633972420005562e-06, "loss": 0.1259, "num_input_tokens_seen": 37439104, "step": 37210 }, { "epoch": 17.545968882602544, "grad_norm": 0.23557648062705994, "learning_rate": 2.2591224265785527e-06, "loss": 0.0628, "num_input_tokens_seen": 37444192, "step": 37215 }, { "epoch": 17.54832626119755, "grad_norm": 0.06518076360225677, "learning_rate": 2.254851460800997e-06, "loss": 0.2222, "num_input_tokens_seen": 37448096, "step": 37220 }, { "epoch": 17.55068363979255, "grad_norm": 0.17439143359661102, "learning_rate": 2.2505843453908963e-06, "loss": 0.1642, "num_input_tokens_seen": 37453568, "step": 37225 }, { "epoch": 17.553041018387553, "grad_norm": 0.04641721770167351, "learning_rate": 2.246321081070607e-06, "loss": 0.0331, "num_input_tokens_seen": 37459712, "step": 37230 }, { "epoch": 17.555398396982554, "grad_norm": 0.08138492703437805, "learning_rate": 2.242061668561821e-06, "loss": 0.0692, "num_input_tokens_seen": 37464416, "step": 37235 }, { "epoch": 17.557755775577558, "grad_norm": 0.43920964002609253, "learning_rate": 2.2378061085855858e-06, "loss": 0.1388, "num_input_tokens_seen": 37469760, "step": 37240 }, { "epoch": 17.56011315417256, "grad_norm": 0.034671783447265625, "learning_rate": 2.233554401862298e-06, "loss": 0.0713, "num_input_tokens_seen": 37474816, "step": 37245 }, { "epoch": 17.562470532767563, "grad_norm": 0.18299336731433868, "learning_rate": 2.2293065491116956e-06, "loss": 0.066, "num_input_tokens_seen": 37479808, "step": 37250 }, { "epoch": 17.564827911362563, "grad_norm": 0.0585179328918457, "learning_rate": 2.2250625510528785e-06, "loss": 0.0473, "num_input_tokens_seen": 37484224, "step": 37255 }, { "epoch": 17.567185289957568, "grad_norm": 2.4450559616088867, "learning_rate": 2.2208224084042663e-06, "loss": 0.1085, "num_input_tokens_seen": 37488992, "step": 37260 }, { "epoch": 17.569542668552568, "grad_norm": 1.4988044500350952, "learning_rate": 2.216586121883654e-06, "loss": 0.0701, "num_input_tokens_seen": 37495488, "step": 37265 }, { "epoch": 17.571900047147572, "grad_norm": 0.07502947002649307, "learning_rate": 2.212353692208172e-06, "loss": 0.0636, "num_input_tokens_seen": 37501216, "step": 37270 }, { "epoch": 17.574257425742573, "grad_norm": 0.8961277604103088, "learning_rate": 2.2081251200942952e-06, "loss": 0.0823, "num_input_tokens_seen": 37506272, "step": 37275 }, { "epoch": 17.576614804337577, "grad_norm": 0.12904973328113556, "learning_rate": 2.2039004062578527e-06, "loss": 0.0345, "num_input_tokens_seen": 37511456, "step": 37280 }, { "epoch": 17.578972182932578, "grad_norm": 0.09803476184606552, "learning_rate": 2.199679551414016e-06, "loss": 0.072, "num_input_tokens_seen": 37515648, "step": 37285 }, { "epoch": 17.581329561527582, "grad_norm": 0.15215159952640533, "learning_rate": 2.195462556277303e-06, "loss": 0.0195, "num_input_tokens_seen": 37520768, "step": 37290 }, { "epoch": 17.583686940122583, "grad_norm": 1.8663082122802734, "learning_rate": 2.1912494215615866e-06, "loss": 0.1709, "num_input_tokens_seen": 37526496, "step": 37295 }, { "epoch": 17.586044318717587, "grad_norm": 0.30881327390670776, "learning_rate": 2.1870401479800654e-06, "loss": 0.1929, "num_input_tokens_seen": 37531552, "step": 37300 }, { "epoch": 17.588401697312587, "grad_norm": 0.05821537971496582, "learning_rate": 2.1828347362453095e-06, "loss": 0.1351, "num_input_tokens_seen": 37536416, "step": 37305 }, { "epoch": 17.59075907590759, "grad_norm": 1.6749622821807861, "learning_rate": 2.178633187069215e-06, "loss": 0.0667, "num_input_tokens_seen": 37540640, "step": 37310 }, { "epoch": 17.593116454502592, "grad_norm": 0.7167164087295532, "learning_rate": 2.174435501163044e-06, "loss": 0.0238, "num_input_tokens_seen": 37545632, "step": 37315 }, { "epoch": 17.595473833097596, "grad_norm": 0.04516315460205078, "learning_rate": 2.170241679237381e-06, "loss": 0.0388, "num_input_tokens_seen": 37550656, "step": 37320 }, { "epoch": 17.597831211692597, "grad_norm": 0.7563056349754333, "learning_rate": 2.1660517220021742e-06, "loss": 0.0322, "num_input_tokens_seen": 37555360, "step": 37325 }, { "epoch": 17.6001885902876, "grad_norm": 0.05494888499379158, "learning_rate": 2.1618656301667122e-06, "loss": 0.1444, "num_input_tokens_seen": 37561408, "step": 37330 }, { "epoch": 17.602545968882602, "grad_norm": 1.9537746906280518, "learning_rate": 2.157683404439631e-06, "loss": 0.1766, "num_input_tokens_seen": 37566912, "step": 37335 }, { "epoch": 17.604903347477606, "grad_norm": 0.6454606056213379, "learning_rate": 2.1535050455289143e-06, "loss": 0.151, "num_input_tokens_seen": 37571040, "step": 37340 }, { "epoch": 17.607260726072607, "grad_norm": 1.7339797019958496, "learning_rate": 2.1493305541418733e-06, "loss": 0.0976, "num_input_tokens_seen": 37576096, "step": 37345 }, { "epoch": 17.60961810466761, "grad_norm": 0.08012327551841736, "learning_rate": 2.145159930985191e-06, "loss": 0.1015, "num_input_tokens_seen": 37580640, "step": 37350 }, { "epoch": 17.61197548326261, "grad_norm": 1.4648287296295166, "learning_rate": 2.14099317676488e-06, "loss": 0.131, "num_input_tokens_seen": 37585440, "step": 37355 }, { "epoch": 17.614332861857616, "grad_norm": 0.05747128650546074, "learning_rate": 2.1368302921862983e-06, "loss": 0.0431, "num_input_tokens_seen": 37590304, "step": 37360 }, { "epoch": 17.616690240452616, "grad_norm": 0.02391158416867256, "learning_rate": 2.132671277954154e-06, "loss": 0.1505, "num_input_tokens_seen": 37596576, "step": 37365 }, { "epoch": 17.61904761904762, "grad_norm": 0.9857409596443176, "learning_rate": 2.1285161347724956e-06, "loss": 0.1697, "num_input_tokens_seen": 37601024, "step": 37370 }, { "epoch": 17.62140499764262, "grad_norm": 0.7688267230987549, "learning_rate": 2.1243648633447217e-06, "loss": 0.2597, "num_input_tokens_seen": 37605888, "step": 37375 }, { "epoch": 17.623762376237625, "grad_norm": 0.05544940382242203, "learning_rate": 2.120217464373575e-06, "loss": 0.022, "num_input_tokens_seen": 37611424, "step": 37380 }, { "epoch": 17.626119754832626, "grad_norm": 0.6602968573570251, "learning_rate": 2.1160739385611306e-06, "loss": 0.0643, "num_input_tokens_seen": 37616544, "step": 37385 }, { "epoch": 17.62847713342763, "grad_norm": 0.30203378200531006, "learning_rate": 2.1119342866088186e-06, "loss": 0.2324, "num_input_tokens_seen": 37621120, "step": 37390 }, { "epoch": 17.63083451202263, "grad_norm": 0.9403830766677856, "learning_rate": 2.107798509217418e-06, "loss": 0.0968, "num_input_tokens_seen": 37625440, "step": 37395 }, { "epoch": 17.633191890617635, "grad_norm": 1.879522442817688, "learning_rate": 2.1036666070870402e-06, "loss": 0.2095, "num_input_tokens_seen": 37631424, "step": 37400 }, { "epoch": 17.635549269212635, "grad_norm": 0.051519379019737244, "learning_rate": 2.099538580917149e-06, "loss": 0.0892, "num_input_tokens_seen": 37636704, "step": 37405 }, { "epoch": 17.63790664780764, "grad_norm": 3.820772409439087, "learning_rate": 2.0954144314065506e-06, "loss": 0.1593, "num_input_tokens_seen": 37641568, "step": 37410 }, { "epoch": 17.64026402640264, "grad_norm": 0.021175434812903404, "learning_rate": 2.0912941592533877e-06, "loss": 0.1159, "num_input_tokens_seen": 37646464, "step": 37415 }, { "epoch": 17.64262140499764, "grad_norm": 0.11770468205213547, "learning_rate": 2.0871777651551627e-06, "loss": 0.1056, "num_input_tokens_seen": 37650912, "step": 37420 }, { "epoch": 17.644978783592645, "grad_norm": 1.7971739768981934, "learning_rate": 2.0830652498086993e-06, "loss": 0.1252, "num_input_tokens_seen": 37655232, "step": 37425 }, { "epoch": 17.647336162187646, "grad_norm": 2.116180658340454, "learning_rate": 2.0789566139101796e-06, "loss": 0.1623, "num_input_tokens_seen": 37660608, "step": 37430 }, { "epoch": 17.64969354078265, "grad_norm": 1.4253642559051514, "learning_rate": 2.07485185815513e-06, "loss": 0.1365, "num_input_tokens_seen": 37665760, "step": 37435 }, { "epoch": 17.65205091937765, "grad_norm": 0.8949353098869324, "learning_rate": 2.0707509832384147e-06, "loss": 0.2185, "num_input_tokens_seen": 37669952, "step": 37440 }, { "epoch": 17.654408297972655, "grad_norm": 0.2972290515899658, "learning_rate": 2.066653989854242e-06, "loss": 0.1053, "num_input_tokens_seen": 37674144, "step": 37445 }, { "epoch": 17.656765676567655, "grad_norm": 0.4771440923213959, "learning_rate": 2.0625608786961596e-06, "loss": 0.1525, "num_input_tokens_seen": 37679264, "step": 37450 }, { "epoch": 17.65912305516266, "grad_norm": 0.6558402180671692, "learning_rate": 2.0584716504570667e-06, "loss": 0.0401, "num_input_tokens_seen": 37684288, "step": 37455 }, { "epoch": 17.66148043375766, "grad_norm": 1.3611875772476196, "learning_rate": 2.0543863058292033e-06, "loss": 0.085, "num_input_tokens_seen": 37688704, "step": 37460 }, { "epoch": 17.663837812352664, "grad_norm": 0.017496002838015556, "learning_rate": 2.050304845504139e-06, "loss": 0.0589, "num_input_tokens_seen": 37693696, "step": 37465 }, { "epoch": 17.666195190947665, "grad_norm": 0.0053619989193975925, "learning_rate": 2.0462272701728015e-06, "loss": 0.0782, "num_input_tokens_seen": 37698784, "step": 37470 }, { "epoch": 17.66855256954267, "grad_norm": 0.9419664740562439, "learning_rate": 2.042153580525455e-06, "loss": 0.1709, "num_input_tokens_seen": 37703968, "step": 37475 }, { "epoch": 17.67090994813767, "grad_norm": 0.1017722487449646, "learning_rate": 2.038083777251704e-06, "loss": 0.1283, "num_input_tokens_seen": 37709216, "step": 37480 }, { "epoch": 17.673267326732674, "grad_norm": 0.09665398299694061, "learning_rate": 2.0340178610404965e-06, "loss": 0.1044, "num_input_tokens_seen": 37713376, "step": 37485 }, { "epoch": 17.675624705327674, "grad_norm": 1.6289100646972656, "learning_rate": 2.0299558325801273e-06, "loss": 0.081, "num_input_tokens_seen": 37718784, "step": 37490 }, { "epoch": 17.67798208392268, "grad_norm": 0.6165511608123779, "learning_rate": 2.025897692558226e-06, "loss": 0.2035, "num_input_tokens_seen": 37723584, "step": 37495 }, { "epoch": 17.68033946251768, "grad_norm": 0.9661131501197815, "learning_rate": 2.0218434416617722e-06, "loss": 0.1117, "num_input_tokens_seen": 37728256, "step": 37500 }, { "epoch": 17.682696841112683, "grad_norm": 1.2876269817352295, "learning_rate": 2.017793080577071e-06, "loss": 0.0968, "num_input_tokens_seen": 37732544, "step": 37505 }, { "epoch": 17.685054219707684, "grad_norm": 0.4012397527694702, "learning_rate": 2.0137466099897862e-06, "loss": 0.0592, "num_input_tokens_seen": 37737536, "step": 37510 }, { "epoch": 17.68741159830269, "grad_norm": 0.10429077595472336, "learning_rate": 2.0097040305849165e-06, "loss": 0.0372, "num_input_tokens_seen": 37742080, "step": 37515 }, { "epoch": 17.68976897689769, "grad_norm": 2.1492836475372314, "learning_rate": 2.0056653430468e-06, "loss": 0.1135, "num_input_tokens_seen": 37746784, "step": 37520 }, { "epoch": 17.692126355492693, "grad_norm": 0.09643525630235672, "learning_rate": 2.001630548059122e-06, "loss": 0.0724, "num_input_tokens_seen": 37752160, "step": 37525 }, { "epoch": 17.694483734087694, "grad_norm": 0.04948874190449715, "learning_rate": 1.9975996463049013e-06, "loss": 0.1395, "num_input_tokens_seen": 37758272, "step": 37530 }, { "epoch": 17.696841112682698, "grad_norm": 0.7256779074668884, "learning_rate": 1.993572638466501e-06, "loss": 0.0766, "num_input_tokens_seen": 37762464, "step": 37535 }, { "epoch": 17.6991984912777, "grad_norm": 0.5935637354850769, "learning_rate": 1.9895495252256286e-06, "loss": 0.0563, "num_input_tokens_seen": 37768384, "step": 37540 }, { "epoch": 17.701555869872703, "grad_norm": 0.04941121116280556, "learning_rate": 1.9855303072633254e-06, "loss": 0.1166, "num_input_tokens_seen": 37773120, "step": 37545 }, { "epoch": 17.703913248467703, "grad_norm": 1.549409031867981, "learning_rate": 1.9815149852599803e-06, "loss": 0.1408, "num_input_tokens_seen": 37777952, "step": 37550 }, { "epoch": 17.706270627062707, "grad_norm": 2.0752127170562744, "learning_rate": 1.9775035598953134e-06, "loss": 0.1353, "num_input_tokens_seen": 37782880, "step": 37555 }, { "epoch": 17.708628005657708, "grad_norm": 0.384602814912796, "learning_rate": 1.9734960318483932e-06, "loss": 0.3658, "num_input_tokens_seen": 37788192, "step": 37560 }, { "epoch": 17.710985384252712, "grad_norm": 0.4709251821041107, "learning_rate": 1.969492401797626e-06, "loss": 0.1171, "num_input_tokens_seen": 37793056, "step": 37565 }, { "epoch": 17.713342762847713, "grad_norm": 0.16919167339801788, "learning_rate": 1.9654926704207627e-06, "loss": 0.1682, "num_input_tokens_seen": 37797600, "step": 37570 }, { "epoch": 17.715700141442717, "grad_norm": 0.8192665576934814, "learning_rate": 1.961496838394883e-06, "loss": 0.0568, "num_input_tokens_seen": 37802656, "step": 37575 }, { "epoch": 17.718057520037718, "grad_norm": 0.09749232232570648, "learning_rate": 1.957504906396421e-06, "loss": 0.0335, "num_input_tokens_seen": 37806848, "step": 37580 }, { "epoch": 17.720414898632722, "grad_norm": 0.11338768899440765, "learning_rate": 1.9535168751011357e-06, "loss": 0.0217, "num_input_tokens_seen": 37812224, "step": 37585 }, { "epoch": 17.722772277227723, "grad_norm": 0.07461626827716827, "learning_rate": 1.9495327451841405e-06, "loss": 0.0352, "num_input_tokens_seen": 37817440, "step": 37590 }, { "epoch": 17.725129655822727, "grad_norm": 0.4043782353401184, "learning_rate": 1.9455525173198736e-06, "loss": 0.0487, "num_input_tokens_seen": 37821344, "step": 37595 }, { "epoch": 17.727487034417727, "grad_norm": 1.8915759325027466, "learning_rate": 1.9415761921821233e-06, "loss": 0.1202, "num_input_tokens_seen": 37826816, "step": 37600 }, { "epoch": 17.72984441301273, "grad_norm": 0.22030141949653625, "learning_rate": 1.9376037704440157e-06, "loss": 0.1809, "num_input_tokens_seen": 37831616, "step": 37605 }, { "epoch": 17.732201791607732, "grad_norm": 0.36844876408576965, "learning_rate": 1.93363525277801e-06, "loss": 0.0697, "num_input_tokens_seen": 37834944, "step": 37610 }, { "epoch": 17.734559170202736, "grad_norm": 0.45387208461761475, "learning_rate": 1.9296706398559126e-06, "loss": 0.1628, "num_input_tokens_seen": 37839392, "step": 37615 }, { "epoch": 17.736916548797737, "grad_norm": 1.5228919982910156, "learning_rate": 1.9257099323488625e-06, "loss": 0.1059, "num_input_tokens_seen": 37843936, "step": 37620 }, { "epoch": 17.739273927392738, "grad_norm": 0.061715830117464066, "learning_rate": 1.921753130927345e-06, "loss": 0.1941, "num_input_tokens_seen": 37848416, "step": 37625 }, { "epoch": 17.74163130598774, "grad_norm": 1.6258116960525513, "learning_rate": 1.9178002362611776e-06, "loss": 0.2541, "num_input_tokens_seen": 37854016, "step": 37630 }, { "epoch": 17.743988684582742, "grad_norm": 1.279463529586792, "learning_rate": 1.913851249019513e-06, "loss": 0.0526, "num_input_tokens_seen": 37861216, "step": 37635 }, { "epoch": 17.746346063177747, "grad_norm": 0.15078625082969666, "learning_rate": 1.909906169870851e-06, "loss": 0.0766, "num_input_tokens_seen": 37866144, "step": 37640 }, { "epoch": 17.748703441772747, "grad_norm": 0.6790719032287598, "learning_rate": 1.9059649994830286e-06, "loss": 0.066, "num_input_tokens_seen": 37871392, "step": 37645 }, { "epoch": 17.75106082036775, "grad_norm": 0.748466432094574, "learning_rate": 1.9020277385232154e-06, "loss": 0.0828, "num_input_tokens_seen": 37876256, "step": 37650 }, { "epoch": 17.753418198962752, "grad_norm": 0.04879361391067505, "learning_rate": 1.8980943876579254e-06, "loss": 0.0211, "num_input_tokens_seen": 37881312, "step": 37655 }, { "epoch": 17.755775577557756, "grad_norm": 1.4175822734832764, "learning_rate": 1.8941649475530094e-06, "loss": 0.1645, "num_input_tokens_seen": 37887712, "step": 37660 }, { "epoch": 17.758132956152757, "grad_norm": 0.2663194239139557, "learning_rate": 1.8902394188736516e-06, "loss": 0.0466, "num_input_tokens_seen": 37892192, "step": 37665 }, { "epoch": 17.76049033474776, "grad_norm": 0.20214390754699707, "learning_rate": 1.886317802284382e-06, "loss": 0.0963, "num_input_tokens_seen": 37895904, "step": 37670 }, { "epoch": 17.76284771334276, "grad_norm": 2.083812952041626, "learning_rate": 1.882400098449058e-06, "loss": 0.1588, "num_input_tokens_seen": 37900576, "step": 37675 }, { "epoch": 17.765205091937766, "grad_norm": 0.12307260185480118, "learning_rate": 1.8784863080308828e-06, "loss": 0.0997, "num_input_tokens_seen": 37904800, "step": 37680 }, { "epoch": 17.767562470532766, "grad_norm": 0.5433928370475769, "learning_rate": 1.874576431692393e-06, "loss": 0.1365, "num_input_tokens_seen": 37909568, "step": 37685 }, { "epoch": 17.76991984912777, "grad_norm": 0.015639126300811768, "learning_rate": 1.8706704700954676e-06, "loss": 0.134, "num_input_tokens_seen": 37917248, "step": 37690 }, { "epoch": 17.77227722772277, "grad_norm": 0.3619641363620758, "learning_rate": 1.8667684239013188e-06, "loss": 0.0625, "num_input_tokens_seen": 37922464, "step": 37695 }, { "epoch": 17.774634606317775, "grad_norm": 0.17611917853355408, "learning_rate": 1.8628702937704939e-06, "loss": 0.0441, "num_input_tokens_seen": 37927040, "step": 37700 }, { "epoch": 17.776991984912776, "grad_norm": 0.07703311741352081, "learning_rate": 1.8589760803628814e-06, "loss": 0.0151, "num_input_tokens_seen": 37931296, "step": 37705 }, { "epoch": 17.77934936350778, "grad_norm": 0.9577727317810059, "learning_rate": 1.8550857843377124e-06, "loss": 0.0372, "num_input_tokens_seen": 37936160, "step": 37710 }, { "epoch": 17.78170674210278, "grad_norm": 0.33974775671958923, "learning_rate": 1.8511994063535377e-06, "loss": 0.1087, "num_input_tokens_seen": 37941248, "step": 37715 }, { "epoch": 17.784064120697785, "grad_norm": 0.24857139587402344, "learning_rate": 1.8473169470682588e-06, "loss": 0.0574, "num_input_tokens_seen": 37947936, "step": 37720 }, { "epoch": 17.786421499292786, "grad_norm": 0.04999583587050438, "learning_rate": 1.8434384071391087e-06, "loss": 0.0883, "num_input_tokens_seen": 37953216, "step": 37725 }, { "epoch": 17.78877887788779, "grad_norm": 0.5226688981056213, "learning_rate": 1.8395637872226618e-06, "loss": 0.1464, "num_input_tokens_seen": 37957984, "step": 37730 }, { "epoch": 17.79113625648279, "grad_norm": 1.2272135019302368, "learning_rate": 1.835693087974824e-06, "loss": 0.169, "num_input_tokens_seen": 37963168, "step": 37735 }, { "epoch": 17.793493635077795, "grad_norm": 0.2339906245470047, "learning_rate": 1.831826310050841e-06, "loss": 0.0469, "num_input_tokens_seen": 37968256, "step": 37740 }, { "epoch": 17.795851013672795, "grad_norm": 0.15971295535564423, "learning_rate": 1.8279634541052886e-06, "loss": 0.1449, "num_input_tokens_seen": 37975488, "step": 37745 }, { "epoch": 17.7982083922678, "grad_norm": 0.13330714404582977, "learning_rate": 1.8241045207920888e-06, "loss": 0.1121, "num_input_tokens_seen": 37980672, "step": 37750 }, { "epoch": 17.8005657708628, "grad_norm": 0.2728968560695648, "learning_rate": 1.8202495107644824e-06, "loss": 0.0889, "num_input_tokens_seen": 37984800, "step": 37755 }, { "epoch": 17.802923149457804, "grad_norm": 0.3081456422805786, "learning_rate": 1.8163984246750671e-06, "loss": 0.0474, "num_input_tokens_seen": 37989920, "step": 37760 }, { "epoch": 17.805280528052805, "grad_norm": 0.8430666923522949, "learning_rate": 1.8125512631757686e-06, "loss": 0.2985, "num_input_tokens_seen": 37995648, "step": 37765 }, { "epoch": 17.80763790664781, "grad_norm": 0.063359834253788, "learning_rate": 1.8087080269178352e-06, "loss": 0.0732, "num_input_tokens_seen": 38000992, "step": 37770 }, { "epoch": 17.80999528524281, "grad_norm": 0.07046862691640854, "learning_rate": 1.8048687165518662e-06, "loss": 0.0174, "num_input_tokens_seen": 38004960, "step": 37775 }, { "epoch": 17.812352663837814, "grad_norm": 0.09105373173952103, "learning_rate": 1.8010333327277944e-06, "loss": 0.1565, "num_input_tokens_seen": 38009344, "step": 37780 }, { "epoch": 17.814710042432814, "grad_norm": 0.07665863633155823, "learning_rate": 1.7972018760948812e-06, "loss": 0.0489, "num_input_tokens_seen": 38014880, "step": 37785 }, { "epoch": 17.81706742102782, "grad_norm": 0.4913877248764038, "learning_rate": 1.7933743473017295e-06, "loss": 0.1341, "num_input_tokens_seen": 38019104, "step": 37790 }, { "epoch": 17.81942479962282, "grad_norm": 1.2014963626861572, "learning_rate": 1.7895507469962768e-06, "loss": 0.0764, "num_input_tokens_seen": 38023712, "step": 37795 }, { "epoch": 17.821782178217823, "grad_norm": 0.19077928364276886, "learning_rate": 1.7857310758257945e-06, "loss": 0.0321, "num_input_tokens_seen": 38027904, "step": 37800 }, { "epoch": 17.824139556812824, "grad_norm": 2.243692398071289, "learning_rate": 1.7819153344368816e-06, "loss": 0.1905, "num_input_tokens_seen": 38032832, "step": 37805 }, { "epoch": 17.826496935407828, "grad_norm": 1.005416989326477, "learning_rate": 1.77810352347548e-06, "loss": 0.0842, "num_input_tokens_seen": 38037952, "step": 37810 }, { "epoch": 17.82885431400283, "grad_norm": 1.0497645139694214, "learning_rate": 1.774295643586868e-06, "loss": 0.066, "num_input_tokens_seen": 38042560, "step": 37815 }, { "epoch": 17.831211692597833, "grad_norm": 0.4936468005180359, "learning_rate": 1.7704916954156548e-06, "loss": 0.03, "num_input_tokens_seen": 38048384, "step": 37820 }, { "epoch": 17.833569071192834, "grad_norm": 0.38934558629989624, "learning_rate": 1.7666916796057808e-06, "loss": 0.032, "num_input_tokens_seen": 38053504, "step": 37825 }, { "epoch": 17.835926449787834, "grad_norm": 0.02618546225130558, "learning_rate": 1.762895596800529e-06, "loss": 0.1019, "num_input_tokens_seen": 38058592, "step": 37830 }, { "epoch": 17.83828382838284, "grad_norm": 0.8148096799850464, "learning_rate": 1.7591034476425072e-06, "loss": 0.1031, "num_input_tokens_seen": 38063264, "step": 37835 }, { "epoch": 17.84064120697784, "grad_norm": 0.14040781557559967, "learning_rate": 1.755315232773666e-06, "loss": 0.1033, "num_input_tokens_seen": 38068416, "step": 37840 }, { "epoch": 17.842998585572843, "grad_norm": 0.16088739037513733, "learning_rate": 1.7515309528352813e-06, "loss": 0.069, "num_input_tokens_seen": 38073376, "step": 37845 }, { "epoch": 17.845355964167844, "grad_norm": 0.25033843517303467, "learning_rate": 1.7477506084679713e-06, "loss": 0.1191, "num_input_tokens_seen": 38078944, "step": 37850 }, { "epoch": 17.847713342762848, "grad_norm": 0.07476099580526352, "learning_rate": 1.743974200311682e-06, "loss": 0.2296, "num_input_tokens_seen": 38083648, "step": 37855 }, { "epoch": 17.85007072135785, "grad_norm": 0.9205811619758606, "learning_rate": 1.7402017290056943e-06, "loss": 0.0471, "num_input_tokens_seen": 38089184, "step": 37860 }, { "epoch": 17.852428099952853, "grad_norm": 0.020877551287412643, "learning_rate": 1.7364331951886247e-06, "loss": 0.0616, "num_input_tokens_seen": 38093760, "step": 37865 }, { "epoch": 17.854785478547853, "grad_norm": 0.7613266706466675, "learning_rate": 1.7326685994984243e-06, "loss": 0.2448, "num_input_tokens_seen": 38098848, "step": 37870 }, { "epoch": 17.857142857142858, "grad_norm": 1.320542812347412, "learning_rate": 1.728907942572372e-06, "loss": 0.1608, "num_input_tokens_seen": 38103904, "step": 37875 }, { "epoch": 17.85950023573786, "grad_norm": 0.10070732235908508, "learning_rate": 1.7251512250470891e-06, "loss": 0.2126, "num_input_tokens_seen": 38108320, "step": 37880 }, { "epoch": 17.861857614332862, "grad_norm": 1.8503261804580688, "learning_rate": 1.7213984475585144e-06, "loss": 0.2316, "num_input_tokens_seen": 38113728, "step": 37885 }, { "epoch": 17.864214992927863, "grad_norm": 0.08823585510253906, "learning_rate": 1.7176496107419338e-06, "loss": 0.0389, "num_input_tokens_seen": 38119360, "step": 37890 }, { "epoch": 17.866572371522867, "grad_norm": 0.19626373052597046, "learning_rate": 1.713904715231962e-06, "loss": 0.0533, "num_input_tokens_seen": 38125024, "step": 37895 }, { "epoch": 17.868929750117868, "grad_norm": 0.25473955273628235, "learning_rate": 1.7101637616625503e-06, "loss": 0.1163, "num_input_tokens_seen": 38128992, "step": 37900 }, { "epoch": 17.871287128712872, "grad_norm": 1.0499296188354492, "learning_rate": 1.7064267506669702e-06, "loss": 0.1953, "num_input_tokens_seen": 38132928, "step": 37905 }, { "epoch": 17.873644507307873, "grad_norm": 0.011197417974472046, "learning_rate": 1.702693682877843e-06, "loss": 0.0468, "num_input_tokens_seen": 38136896, "step": 37910 }, { "epoch": 17.876001885902877, "grad_norm": 0.16101256012916565, "learning_rate": 1.6989645589271076e-06, "loss": 0.0835, "num_input_tokens_seen": 38141792, "step": 37915 }, { "epoch": 17.878359264497877, "grad_norm": 0.25889384746551514, "learning_rate": 1.6952393794460453e-06, "loss": 0.0565, "num_input_tokens_seen": 38146496, "step": 37920 }, { "epoch": 17.88071664309288, "grad_norm": 1.5580331087112427, "learning_rate": 1.691518145065263e-06, "loss": 0.13, "num_input_tokens_seen": 38151136, "step": 37925 }, { "epoch": 17.883074021687882, "grad_norm": 0.025974510237574577, "learning_rate": 1.6878008564147007e-06, "loss": 0.0353, "num_input_tokens_seen": 38155968, "step": 37930 }, { "epoch": 17.885431400282886, "grad_norm": 0.313276082277298, "learning_rate": 1.6840875141236362e-06, "loss": 0.0204, "num_input_tokens_seen": 38161088, "step": 37935 }, { "epoch": 17.887788778877887, "grad_norm": 0.02667313627898693, "learning_rate": 1.6803781188206746e-06, "loss": 0.1929, "num_input_tokens_seen": 38167360, "step": 37940 }, { "epoch": 17.89014615747289, "grad_norm": 0.5515124797821045, "learning_rate": 1.6766726711337527e-06, "loss": 0.0819, "num_input_tokens_seen": 38173088, "step": 37945 }, { "epoch": 17.892503536067892, "grad_norm": 0.6000121235847473, "learning_rate": 1.6729711716901381e-06, "loss": 0.0983, "num_input_tokens_seen": 38178336, "step": 37950 }, { "epoch": 17.894860914662896, "grad_norm": 0.1515359878540039, "learning_rate": 1.669273621116435e-06, "loss": 0.0664, "num_input_tokens_seen": 38182816, "step": 37955 }, { "epoch": 17.897218293257897, "grad_norm": 0.4118606150150299, "learning_rate": 1.665580020038579e-06, "loss": 0.0208, "num_input_tokens_seen": 38187936, "step": 37960 }, { "epoch": 17.8995756718529, "grad_norm": 0.11537894606590271, "learning_rate": 1.6618903690818255e-06, "loss": 0.074, "num_input_tokens_seen": 38192224, "step": 37965 }, { "epoch": 17.9019330504479, "grad_norm": 1.5259660482406616, "learning_rate": 1.658204668870772e-06, "loss": 0.0975, "num_input_tokens_seen": 38196032, "step": 37970 }, { "epoch": 17.904290429042906, "grad_norm": 0.13105161488056183, "learning_rate": 1.65452292002935e-06, "loss": 0.1836, "num_input_tokens_seen": 38201824, "step": 37975 }, { "epoch": 17.906647807637906, "grad_norm": 0.01152564026415348, "learning_rate": 1.6508451231808109e-06, "loss": 0.0093, "num_input_tokens_seen": 38206912, "step": 37980 }, { "epoch": 17.90900518623291, "grad_norm": 1.1748642921447754, "learning_rate": 1.6471712789477484e-06, "loss": 0.0929, "num_input_tokens_seen": 38211968, "step": 37985 }, { "epoch": 17.91136256482791, "grad_norm": 0.194004088640213, "learning_rate": 1.6435013879520844e-06, "loss": 0.1243, "num_input_tokens_seen": 38218624, "step": 37990 }, { "epoch": 17.913719943422915, "grad_norm": 0.2925615906715393, "learning_rate": 1.639835450815061e-06, "loss": 0.0255, "num_input_tokens_seen": 38223104, "step": 37995 }, { "epoch": 17.916077322017916, "grad_norm": 0.06378606706857681, "learning_rate": 1.6361734681572593e-06, "loss": 0.0902, "num_input_tokens_seen": 38227456, "step": 38000 }, { "epoch": 17.91843470061292, "grad_norm": 0.09834448248147964, "learning_rate": 1.6325154405985977e-06, "loss": 0.0694, "num_input_tokens_seen": 38231968, "step": 38005 }, { "epoch": 17.92079207920792, "grad_norm": 0.334271639585495, "learning_rate": 1.6288613687583164e-06, "loss": 0.1082, "num_input_tokens_seen": 38236992, "step": 38010 }, { "epoch": 17.92314945780292, "grad_norm": 0.01613500900566578, "learning_rate": 1.625211253254985e-06, "loss": 0.0443, "num_input_tokens_seen": 38243072, "step": 38015 }, { "epoch": 17.925506836397926, "grad_norm": 0.03817380964756012, "learning_rate": 1.6215650947065037e-06, "loss": 0.0527, "num_input_tokens_seen": 38247808, "step": 38020 }, { "epoch": 17.927864214992926, "grad_norm": 1.5816646814346313, "learning_rate": 1.6179228937301116e-06, "loss": 0.3327, "num_input_tokens_seen": 38254656, "step": 38025 }, { "epoch": 17.93022159358793, "grad_norm": 4.419330596923828, "learning_rate": 1.6142846509423658e-06, "loss": 0.1888, "num_input_tokens_seen": 38259232, "step": 38030 }, { "epoch": 17.93257897218293, "grad_norm": 0.1487259566783905, "learning_rate": 1.6106503669591655e-06, "loss": 0.0187, "num_input_tokens_seen": 38264128, "step": 38035 }, { "epoch": 17.934936350777935, "grad_norm": 1.0836246013641357, "learning_rate": 1.6070200423957265e-06, "loss": 0.052, "num_input_tokens_seen": 38269312, "step": 38040 }, { "epoch": 17.937293729372936, "grad_norm": 0.11459558457136154, "learning_rate": 1.6033936778666047e-06, "loss": 0.1065, "num_input_tokens_seen": 38274144, "step": 38045 }, { "epoch": 17.93965110796794, "grad_norm": 2.3813254833221436, "learning_rate": 1.599771273985684e-06, "loss": 0.1398, "num_input_tokens_seen": 38278976, "step": 38050 }, { "epoch": 17.94200848656294, "grad_norm": 0.19292382895946503, "learning_rate": 1.596152831366171e-06, "loss": 0.1514, "num_input_tokens_seen": 38284192, "step": 38055 }, { "epoch": 17.944365865157945, "grad_norm": 1.0842889547348022, "learning_rate": 1.5925383506206094e-06, "loss": 0.267, "num_input_tokens_seen": 38289696, "step": 38060 }, { "epoch": 17.946723243752945, "grad_norm": 0.1994837373495102, "learning_rate": 1.5889278323608702e-06, "loss": 0.2702, "num_input_tokens_seen": 38293792, "step": 38065 }, { "epoch": 17.94908062234795, "grad_norm": 1.078455924987793, "learning_rate": 1.5853212771981484e-06, "loss": 0.0434, "num_input_tokens_seen": 38299136, "step": 38070 }, { "epoch": 17.95143800094295, "grad_norm": 0.03133174777030945, "learning_rate": 1.5817186857429779e-06, "loss": 0.0257, "num_input_tokens_seen": 38304608, "step": 38075 }, { "epoch": 17.953795379537954, "grad_norm": 3.1636526584625244, "learning_rate": 1.578120058605212e-06, "loss": 0.1999, "num_input_tokens_seen": 38311200, "step": 38080 }, { "epoch": 17.956152758132955, "grad_norm": 0.0610407218337059, "learning_rate": 1.5745253963940416e-06, "loss": 0.0648, "num_input_tokens_seen": 38316928, "step": 38085 }, { "epoch": 17.95851013672796, "grad_norm": 1.205494999885559, "learning_rate": 1.5709346997179824e-06, "loss": 0.0659, "num_input_tokens_seen": 38321888, "step": 38090 }, { "epoch": 17.96086751532296, "grad_norm": 0.28479740023612976, "learning_rate": 1.5673479691848737e-06, "loss": 0.0711, "num_input_tokens_seen": 38326496, "step": 38095 }, { "epoch": 17.963224893917964, "grad_norm": 0.19146539270877838, "learning_rate": 1.5637652054018875e-06, "loss": 0.1354, "num_input_tokens_seen": 38330976, "step": 38100 }, { "epoch": 17.965582272512965, "grad_norm": 0.13048356771469116, "learning_rate": 1.560186408975528e-06, "loss": 0.0629, "num_input_tokens_seen": 38335072, "step": 38105 }, { "epoch": 17.96793965110797, "grad_norm": 0.6511064171791077, "learning_rate": 1.5566115805116243e-06, "loss": 0.111, "num_input_tokens_seen": 38339808, "step": 38110 }, { "epoch": 17.97029702970297, "grad_norm": 0.04781750589609146, "learning_rate": 1.5530407206153342e-06, "loss": 0.0931, "num_input_tokens_seen": 38345696, "step": 38115 }, { "epoch": 17.972654408297974, "grad_norm": 0.2961576282978058, "learning_rate": 1.5494738298911432e-06, "loss": 0.0555, "num_input_tokens_seen": 38349632, "step": 38120 }, { "epoch": 17.975011786892974, "grad_norm": 2.0500893592834473, "learning_rate": 1.545910908942866e-06, "loss": 0.1683, "num_input_tokens_seen": 38354592, "step": 38125 }, { "epoch": 17.97736916548798, "grad_norm": 0.9742678999900818, "learning_rate": 1.5423519583736472e-06, "loss": 0.0512, "num_input_tokens_seen": 38359552, "step": 38130 }, { "epoch": 17.97972654408298, "grad_norm": 1.182594656944275, "learning_rate": 1.5387969787859503e-06, "loss": 0.1685, "num_input_tokens_seen": 38363872, "step": 38135 }, { "epoch": 17.982083922677983, "grad_norm": 0.7310320734977722, "learning_rate": 1.5352459707815764e-06, "loss": 0.0553, "num_input_tokens_seen": 38369760, "step": 38140 }, { "epoch": 17.984441301272984, "grad_norm": 1.1794989109039307, "learning_rate": 1.5316989349616507e-06, "loss": 0.1845, "num_input_tokens_seen": 38375168, "step": 38145 }, { "epoch": 17.986798679867988, "grad_norm": 0.3399054706096649, "learning_rate": 1.5281558719266258e-06, "loss": 0.0623, "num_input_tokens_seen": 38379936, "step": 38150 }, { "epoch": 17.98915605846299, "grad_norm": 0.41412603855133057, "learning_rate": 1.5246167822762808e-06, "loss": 0.0628, "num_input_tokens_seen": 38385056, "step": 38155 }, { "epoch": 17.991513437057993, "grad_norm": 0.9744095802307129, "learning_rate": 1.5210816666097277e-06, "loss": 0.2301, "num_input_tokens_seen": 38390048, "step": 38160 }, { "epoch": 17.993870815652993, "grad_norm": 0.03493468835949898, "learning_rate": 1.5175505255253992e-06, "loss": 0.0948, "num_input_tokens_seen": 38395136, "step": 38165 }, { "epoch": 17.996228194247998, "grad_norm": 0.5169470310211182, "learning_rate": 1.5140233596210612e-06, "loss": 0.064, "num_input_tokens_seen": 38399520, "step": 38170 }, { "epoch": 17.998585572842998, "grad_norm": 0.7641888856887817, "learning_rate": 1.5105001694937949e-06, "loss": 0.1512, "num_input_tokens_seen": 38405632, "step": 38175 }, { "epoch": 18.0, "eval_loss": 0.15290038287639618, "eval_runtime": 15.1865, "eval_samples_per_second": 62.095, "eval_steps_per_second": 15.54, "num_input_tokens_seen": 38408800, "step": 38178 }, { "epoch": 18.000942951438002, "grad_norm": 0.02976931445300579, "learning_rate": 1.50698095574002e-06, "loss": 0.1137, "num_input_tokens_seen": 38410912, "step": 38180 }, { "epoch": 18.003300330033003, "grad_norm": 0.24781011044979095, "learning_rate": 1.5034657189554824e-06, "loss": 0.0959, "num_input_tokens_seen": 38418112, "step": 38185 }, { "epoch": 18.005657708628007, "grad_norm": 0.43411386013031006, "learning_rate": 1.4999544597352505e-06, "loss": 0.0534, "num_input_tokens_seen": 38423040, "step": 38190 }, { "epoch": 18.008015087223008, "grad_norm": 0.19041170179843903, "learning_rate": 1.4964471786737184e-06, "loss": 0.0248, "num_input_tokens_seen": 38427520, "step": 38195 }, { "epoch": 18.010372465818012, "grad_norm": 1.0179342031478882, "learning_rate": 1.492943876364611e-06, "loss": 0.0336, "num_input_tokens_seen": 38432032, "step": 38200 }, { "epoch": 18.012729844413013, "grad_norm": 0.12969887256622314, "learning_rate": 1.4894445534009816e-06, "loss": 0.099, "num_input_tokens_seen": 38437504, "step": 38205 }, { "epoch": 18.015087223008017, "grad_norm": 1.1152628660202026, "learning_rate": 1.4859492103752037e-06, "loss": 0.085, "num_input_tokens_seen": 38442240, "step": 38210 }, { "epoch": 18.017444601603017, "grad_norm": 0.5054903030395508, "learning_rate": 1.4824578478789763e-06, "loss": 0.0383, "num_input_tokens_seen": 38447232, "step": 38215 }, { "epoch": 18.019801980198018, "grad_norm": 1.5591028928756714, "learning_rate": 1.4789704665033321e-06, "loss": 0.1113, "num_input_tokens_seen": 38451936, "step": 38220 }, { "epoch": 18.022159358793022, "grad_norm": 0.24887223541736603, "learning_rate": 1.475487066838621e-06, "loss": 0.0824, "num_input_tokens_seen": 38456896, "step": 38225 }, { "epoch": 18.024516737388023, "grad_norm": 0.3102187216281891, "learning_rate": 1.4720076494745243e-06, "loss": 0.2471, "num_input_tokens_seen": 38461568, "step": 38230 }, { "epoch": 18.026874115983027, "grad_norm": 0.01168092805892229, "learning_rate": 1.468532215000054e-06, "loss": 0.1113, "num_input_tokens_seen": 38466080, "step": 38235 }, { "epoch": 18.029231494578028, "grad_norm": 1.2866170406341553, "learning_rate": 1.4650607640035342e-06, "loss": 0.1319, "num_input_tokens_seen": 38471296, "step": 38240 }, { "epoch": 18.031588873173032, "grad_norm": 0.31787434220314026, "learning_rate": 1.4615932970726276e-06, "loss": 0.0225, "num_input_tokens_seen": 38476608, "step": 38245 }, { "epoch": 18.033946251768032, "grad_norm": 0.5211480259895325, "learning_rate": 1.4581298147943179e-06, "loss": 0.2479, "num_input_tokens_seen": 38481888, "step": 38250 }, { "epoch": 18.036303630363037, "grad_norm": 0.10647409409284592, "learning_rate": 1.454670317754911e-06, "loss": 0.052, "num_input_tokens_seen": 38486560, "step": 38255 }, { "epoch": 18.038661008958037, "grad_norm": 0.9072138071060181, "learning_rate": 1.4512148065400465e-06, "loss": 0.0541, "num_input_tokens_seen": 38491456, "step": 38260 }, { "epoch": 18.04101838755304, "grad_norm": 1.6417008638381958, "learning_rate": 1.4477632817346764e-06, "loss": 0.1157, "num_input_tokens_seen": 38496992, "step": 38265 }, { "epoch": 18.043375766148042, "grad_norm": 1.5455669164657593, "learning_rate": 1.4443157439230858e-06, "loss": 0.3084, "num_input_tokens_seen": 38501728, "step": 38270 }, { "epoch": 18.045733144743046, "grad_norm": 0.07652879506349564, "learning_rate": 1.4408721936888913e-06, "loss": 0.0447, "num_input_tokens_seen": 38506272, "step": 38275 }, { "epoch": 18.048090523338047, "grad_norm": 0.9483412504196167, "learning_rate": 1.4374326316150182e-06, "loss": 0.0947, "num_input_tokens_seen": 38512000, "step": 38280 }, { "epoch": 18.05044790193305, "grad_norm": 1.2972936630249023, "learning_rate": 1.4339970582837343e-06, "loss": 0.1303, "num_input_tokens_seen": 38517344, "step": 38285 }, { "epoch": 18.05280528052805, "grad_norm": 0.9887511134147644, "learning_rate": 1.430565474276621e-06, "loss": 0.1124, "num_input_tokens_seen": 38521440, "step": 38290 }, { "epoch": 18.055162659123056, "grad_norm": 1.6805179119110107, "learning_rate": 1.4271378801745867e-06, "loss": 0.1681, "num_input_tokens_seen": 38525568, "step": 38295 }, { "epoch": 18.057520037718056, "grad_norm": 0.31361550092697144, "learning_rate": 1.4237142765578664e-06, "loss": 0.0516, "num_input_tokens_seen": 38530336, "step": 38300 }, { "epoch": 18.05987741631306, "grad_norm": 0.34101593494415283, "learning_rate": 1.4202946640060138e-06, "loss": 0.222, "num_input_tokens_seen": 38535264, "step": 38305 }, { "epoch": 18.06223479490806, "grad_norm": 0.6898139715194702, "learning_rate": 1.4168790430979128e-06, "loss": 0.0651, "num_input_tokens_seen": 38540064, "step": 38310 }, { "epoch": 18.064592173503065, "grad_norm": 0.10133841633796692, "learning_rate": 1.4134674144117733e-06, "loss": 0.0242, "num_input_tokens_seen": 38546976, "step": 38315 }, { "epoch": 18.066949552098066, "grad_norm": 0.22497354447841644, "learning_rate": 1.4100597785251217e-06, "loss": 0.063, "num_input_tokens_seen": 38552128, "step": 38320 }, { "epoch": 18.06930693069307, "grad_norm": 0.5671169757843018, "learning_rate": 1.4066561360148167e-06, "loss": 0.0469, "num_input_tokens_seen": 38557088, "step": 38325 }, { "epoch": 18.07166430928807, "grad_norm": 0.752130389213562, "learning_rate": 1.4032564874570359e-06, "loss": 0.0322, "num_input_tokens_seen": 38562112, "step": 38330 }, { "epoch": 18.074021687883075, "grad_norm": 0.13095377385616302, "learning_rate": 1.3998608334272829e-06, "loss": 0.1247, "num_input_tokens_seen": 38566112, "step": 38335 }, { "epoch": 18.076379066478076, "grad_norm": 0.7244886755943298, "learning_rate": 1.3964691745003867e-06, "loss": 0.0259, "num_input_tokens_seen": 38572000, "step": 38340 }, { "epoch": 18.07873644507308, "grad_norm": 0.5033345222473145, "learning_rate": 1.393081511250488e-06, "loss": 0.2143, "num_input_tokens_seen": 38577024, "step": 38345 }, { "epoch": 18.08109382366808, "grad_norm": 0.3108872175216675, "learning_rate": 1.3896978442510672e-06, "loss": 0.0201, "num_input_tokens_seen": 38581152, "step": 38350 }, { "epoch": 18.083451202263085, "grad_norm": 0.3198321759700775, "learning_rate": 1.3863181740749243e-06, "loss": 0.1473, "num_input_tokens_seen": 38585952, "step": 38355 }, { "epoch": 18.085808580858085, "grad_norm": 0.020824113860726357, "learning_rate": 1.3829425012941737e-06, "loss": 0.0402, "num_input_tokens_seen": 38590528, "step": 38360 }, { "epoch": 18.08816595945309, "grad_norm": 0.12133799493312836, "learning_rate": 1.3795708264802642e-06, "loss": 0.0429, "num_input_tokens_seen": 38594656, "step": 38365 }, { "epoch": 18.09052333804809, "grad_norm": 0.29381248354911804, "learning_rate": 1.3762031502039613e-06, "loss": 0.0563, "num_input_tokens_seen": 38599712, "step": 38370 }, { "epoch": 18.092880716643094, "grad_norm": 0.7548164129257202, "learning_rate": 1.3728394730353562e-06, "loss": 0.2606, "num_input_tokens_seen": 38604832, "step": 38375 }, { "epoch": 18.095238095238095, "grad_norm": 1.283810019493103, "learning_rate": 1.3694797955438655e-06, "loss": 0.0848, "num_input_tokens_seen": 38609088, "step": 38380 }, { "epoch": 18.0975954738331, "grad_norm": 0.04845532029867172, "learning_rate": 1.3661241182982176e-06, "loss": 0.1127, "num_input_tokens_seen": 38613824, "step": 38385 }, { "epoch": 18.0999528524281, "grad_norm": 0.49194565415382385, "learning_rate": 1.3627724418664778e-06, "loss": 0.1049, "num_input_tokens_seen": 38618752, "step": 38390 }, { "epoch": 18.102310231023104, "grad_norm": 0.5088522434234619, "learning_rate": 1.3594247668160253e-06, "loss": 0.1734, "num_input_tokens_seen": 38624000, "step": 38395 }, { "epoch": 18.104667609618105, "grad_norm": 0.15555661916732788, "learning_rate": 1.3560810937135655e-06, "loss": 0.0639, "num_input_tokens_seen": 38628256, "step": 38400 }, { "epoch": 18.10702498821311, "grad_norm": 0.6759694218635559, "learning_rate": 1.3527414231251256e-06, "loss": 0.2052, "num_input_tokens_seen": 38633504, "step": 38405 }, { "epoch": 18.10938236680811, "grad_norm": 0.104024738073349, "learning_rate": 1.3494057556160566e-06, "loss": 0.1238, "num_input_tokens_seen": 38637728, "step": 38410 }, { "epoch": 18.111739745403113, "grad_norm": 0.12179803103208542, "learning_rate": 1.346074091751029e-06, "loss": 0.0541, "num_input_tokens_seen": 38642624, "step": 38415 }, { "epoch": 18.114097123998114, "grad_norm": 0.18316902220249176, "learning_rate": 1.3427464320940386e-06, "loss": 0.0856, "num_input_tokens_seen": 38647040, "step": 38420 }, { "epoch": 18.116454502593115, "grad_norm": 2.4632408618927, "learning_rate": 1.3394227772083961e-06, "loss": 0.0855, "num_input_tokens_seen": 38650784, "step": 38425 }, { "epoch": 18.11881188118812, "grad_norm": 3.4536187648773193, "learning_rate": 1.336103127656746e-06, "loss": 0.1547, "num_input_tokens_seen": 38656224, "step": 38430 }, { "epoch": 18.12116925978312, "grad_norm": 0.1647665649652481, "learning_rate": 1.332787484001044e-06, "loss": 0.0317, "num_input_tokens_seen": 38661952, "step": 38435 }, { "epoch": 18.123526638378124, "grad_norm": 0.8083735108375549, "learning_rate": 1.3294758468025802e-06, "loss": 0.148, "num_input_tokens_seen": 38666816, "step": 38440 }, { "epoch": 18.125884016973124, "grad_norm": 0.04715798422694206, "learning_rate": 1.326168216621948e-06, "loss": 0.0567, "num_input_tokens_seen": 38670720, "step": 38445 }, { "epoch": 18.12824139556813, "grad_norm": 0.21050602197647095, "learning_rate": 1.322864594019077e-06, "loss": 0.0456, "num_input_tokens_seen": 38675168, "step": 38450 }, { "epoch": 18.13059877416313, "grad_norm": 0.21660563349723816, "learning_rate": 1.3195649795532173e-06, "loss": 0.1743, "num_input_tokens_seen": 38680512, "step": 38455 }, { "epoch": 18.132956152758133, "grad_norm": 0.6098459362983704, "learning_rate": 1.3162693737829356e-06, "loss": 0.0605, "num_input_tokens_seen": 38685248, "step": 38460 }, { "epoch": 18.135313531353134, "grad_norm": 0.7133105397224426, "learning_rate": 1.312977777266125e-06, "loss": 0.1778, "num_input_tokens_seen": 38691136, "step": 38465 }, { "epoch": 18.137670909948138, "grad_norm": 1.1594274044036865, "learning_rate": 1.3096901905599895e-06, "loss": 0.0664, "num_input_tokens_seen": 38695872, "step": 38470 }, { "epoch": 18.14002828854314, "grad_norm": 0.39767447113990784, "learning_rate": 1.3064066142210673e-06, "loss": 0.0437, "num_input_tokens_seen": 38700672, "step": 38475 }, { "epoch": 18.142385667138143, "grad_norm": 0.30666640400886536, "learning_rate": 1.303127048805211e-06, "loss": 0.0862, "num_input_tokens_seen": 38706368, "step": 38480 }, { "epoch": 18.144743045733144, "grad_norm": 1.525651454925537, "learning_rate": 1.2998514948675932e-06, "loss": 0.0878, "num_input_tokens_seen": 38711360, "step": 38485 }, { "epoch": 18.147100424328148, "grad_norm": 0.01650037057697773, "learning_rate": 1.296579952962715e-06, "loss": 0.1077, "num_input_tokens_seen": 38715840, "step": 38490 }, { "epoch": 18.14945780292315, "grad_norm": 3.418607473373413, "learning_rate": 1.2933124236443884e-06, "loss": 0.2678, "num_input_tokens_seen": 38720608, "step": 38495 }, { "epoch": 18.151815181518153, "grad_norm": 1.47606360912323, "learning_rate": 1.2900489074657519e-06, "loss": 0.117, "num_input_tokens_seen": 38725440, "step": 38500 }, { "epoch": 18.154172560113153, "grad_norm": 0.3298921287059784, "learning_rate": 1.2867894049792662e-06, "loss": 0.0777, "num_input_tokens_seen": 38729376, "step": 38505 }, { "epoch": 18.156529938708157, "grad_norm": 1.7041215896606445, "learning_rate": 1.2835339167367038e-06, "loss": 0.1746, "num_input_tokens_seen": 38734816, "step": 38510 }, { "epoch": 18.158887317303158, "grad_norm": 1.6502629518508911, "learning_rate": 1.2802824432891658e-06, "loss": 0.1885, "num_input_tokens_seen": 38740800, "step": 38515 }, { "epoch": 18.161244695898162, "grad_norm": 0.1152840331196785, "learning_rate": 1.2770349851870727e-06, "loss": 0.064, "num_input_tokens_seen": 38745024, "step": 38520 }, { "epoch": 18.163602074493163, "grad_norm": 0.373716801404953, "learning_rate": 1.2737915429801628e-06, "loss": 0.0423, "num_input_tokens_seen": 38749632, "step": 38525 }, { "epoch": 18.165959453088167, "grad_norm": 1.888611078262329, "learning_rate": 1.2705521172174994e-06, "loss": 0.1186, "num_input_tokens_seen": 38754464, "step": 38530 }, { "epoch": 18.168316831683168, "grad_norm": 0.806443452835083, "learning_rate": 1.267316708447458e-06, "loss": 0.0427, "num_input_tokens_seen": 38760032, "step": 38535 }, { "epoch": 18.17067421027817, "grad_norm": 0.07209982722997665, "learning_rate": 1.2640853172177392e-06, "loss": 0.0707, "num_input_tokens_seen": 38764896, "step": 38540 }, { "epoch": 18.173031588873172, "grad_norm": 0.9517520070075989, "learning_rate": 1.2608579440753664e-06, "loss": 0.0995, "num_input_tokens_seen": 38769760, "step": 38545 }, { "epoch": 18.175388967468177, "grad_norm": 0.8647726774215698, "learning_rate": 1.2576345895666725e-06, "loss": 0.1035, "num_input_tokens_seen": 38774976, "step": 38550 }, { "epoch": 18.177746346063177, "grad_norm": 0.052909839898347855, "learning_rate": 1.2544152542373205e-06, "loss": 0.0762, "num_input_tokens_seen": 38779392, "step": 38555 }, { "epoch": 18.18010372465818, "grad_norm": 2.8576834201812744, "learning_rate": 1.2511999386322914e-06, "loss": 0.0641, "num_input_tokens_seen": 38784512, "step": 38560 }, { "epoch": 18.182461103253182, "grad_norm": 1.4187721014022827, "learning_rate": 1.2479886432958777e-06, "loss": 0.1706, "num_input_tokens_seen": 38788736, "step": 38565 }, { "epoch": 18.184818481848186, "grad_norm": 0.22836634516716003, "learning_rate": 1.2447813687717024e-06, "loss": 0.0556, "num_input_tokens_seen": 38793376, "step": 38570 }, { "epoch": 18.187175860443187, "grad_norm": 1.1756030321121216, "learning_rate": 1.2415781156026985e-06, "loss": 0.0929, "num_input_tokens_seen": 38798656, "step": 38575 }, { "epoch": 18.18953323903819, "grad_norm": 2.6823928356170654, "learning_rate": 1.238378884331126e-06, "loss": 0.0944, "num_input_tokens_seen": 38803584, "step": 38580 }, { "epoch": 18.19189061763319, "grad_norm": 2.0696005821228027, "learning_rate": 1.2351836754985636e-06, "loss": 0.1231, "num_input_tokens_seen": 38808352, "step": 38585 }, { "epoch": 18.194247996228196, "grad_norm": 0.8708615899085999, "learning_rate": 1.2319924896458973e-06, "loss": 0.0884, "num_input_tokens_seen": 38813696, "step": 38590 }, { "epoch": 18.196605374823196, "grad_norm": 0.025032762438058853, "learning_rate": 1.2288053273133426e-06, "loss": 0.1087, "num_input_tokens_seen": 38819776, "step": 38595 }, { "epoch": 18.1989627534182, "grad_norm": 0.6150924563407898, "learning_rate": 1.2256221890404374e-06, "loss": 0.0214, "num_input_tokens_seen": 38825152, "step": 38600 }, { "epoch": 18.2013201320132, "grad_norm": 0.6286703944206238, "learning_rate": 1.2224430753660282e-06, "loss": 0.0234, "num_input_tokens_seen": 38829760, "step": 38605 }, { "epoch": 18.203677510608205, "grad_norm": 0.16027005016803741, "learning_rate": 1.2192679868282848e-06, "loss": 0.0265, "num_input_tokens_seen": 38834528, "step": 38610 }, { "epoch": 18.206034889203206, "grad_norm": 0.17131900787353516, "learning_rate": 1.216096923964699e-06, "loss": 0.1471, "num_input_tokens_seen": 38839296, "step": 38615 }, { "epoch": 18.208392267798207, "grad_norm": 0.1449434906244278, "learning_rate": 1.2129298873120753e-06, "loss": 0.1467, "num_input_tokens_seen": 38844192, "step": 38620 }, { "epoch": 18.21074964639321, "grad_norm": 0.08847605437040329, "learning_rate": 1.2097668774065458e-06, "loss": 0.2066, "num_input_tokens_seen": 38849440, "step": 38625 }, { "epoch": 18.21310702498821, "grad_norm": 0.035313088446855545, "learning_rate": 1.206607894783543e-06, "loss": 0.0273, "num_input_tokens_seen": 38854432, "step": 38630 }, { "epoch": 18.215464403583216, "grad_norm": 0.9941276907920837, "learning_rate": 1.2034529399778365e-06, "loss": 0.1022, "num_input_tokens_seen": 38859744, "step": 38635 }, { "epoch": 18.217821782178216, "grad_norm": 0.1699114888906479, "learning_rate": 1.2003020135235021e-06, "loss": 0.0428, "num_input_tokens_seen": 38864672, "step": 38640 }, { "epoch": 18.22017916077322, "grad_norm": 0.5697372555732727, "learning_rate": 1.1971551159539434e-06, "loss": 0.0976, "num_input_tokens_seen": 38869760, "step": 38645 }, { "epoch": 18.22253653936822, "grad_norm": 0.5206603407859802, "learning_rate": 1.1940122478018734e-06, "loss": 0.1109, "num_input_tokens_seen": 38874592, "step": 38650 }, { "epoch": 18.224893917963225, "grad_norm": 1.176051139831543, "learning_rate": 1.1908734095993274e-06, "loss": 0.1565, "num_input_tokens_seen": 38879008, "step": 38655 }, { "epoch": 18.227251296558226, "grad_norm": 0.04710017889738083, "learning_rate": 1.187738601877658e-06, "loss": 0.0397, "num_input_tokens_seen": 38883680, "step": 38660 }, { "epoch": 18.22960867515323, "grad_norm": 2.4549853801727295, "learning_rate": 1.1846078251675386e-06, "loss": 0.1841, "num_input_tokens_seen": 38889280, "step": 38665 }, { "epoch": 18.23196605374823, "grad_norm": 0.04497886076569557, "learning_rate": 1.1814810799989474e-06, "loss": 0.0801, "num_input_tokens_seen": 38894464, "step": 38670 }, { "epoch": 18.234323432343235, "grad_norm": 1.794123649597168, "learning_rate": 1.1783583669011943e-06, "loss": 0.2402, "num_input_tokens_seen": 38899520, "step": 38675 }, { "epoch": 18.236680810938235, "grad_norm": 1.8099886178970337, "learning_rate": 1.1752396864029013e-06, "loss": 0.1279, "num_input_tokens_seen": 38904896, "step": 38680 }, { "epoch": 18.23903818953324, "grad_norm": 0.20292535424232483, "learning_rate": 1.172125039032007e-06, "loss": 0.1104, "num_input_tokens_seen": 38910048, "step": 38685 }, { "epoch": 18.24139556812824, "grad_norm": 0.6592541337013245, "learning_rate": 1.1690144253157704e-06, "loss": 0.0703, "num_input_tokens_seen": 38913952, "step": 38690 }, { "epoch": 18.243752946723244, "grad_norm": 1.5472967624664307, "learning_rate": 1.1659078457807644e-06, "loss": 0.1583, "num_input_tokens_seen": 38921216, "step": 38695 }, { "epoch": 18.246110325318245, "grad_norm": 0.25798261165618896, "learning_rate": 1.1628053009528766e-06, "loss": 0.0849, "num_input_tokens_seen": 38925696, "step": 38700 }, { "epoch": 18.24846770391325, "grad_norm": 2.3730218410491943, "learning_rate": 1.1597067913573234e-06, "loss": 0.1707, "num_input_tokens_seen": 38932256, "step": 38705 }, { "epoch": 18.25082508250825, "grad_norm": 0.13967858254909515, "learning_rate": 1.1566123175186238e-06, "loss": 0.1796, "num_input_tokens_seen": 38937248, "step": 38710 }, { "epoch": 18.253182461103254, "grad_norm": 0.16086171567440033, "learning_rate": 1.153521879960623e-06, "loss": 0.0563, "num_input_tokens_seen": 38942720, "step": 38715 }, { "epoch": 18.255539839698255, "grad_norm": 0.12828809022903442, "learning_rate": 1.150435479206477e-06, "loss": 0.1747, "num_input_tokens_seen": 38952256, "step": 38720 }, { "epoch": 18.25789721829326, "grad_norm": 0.19268667697906494, "learning_rate": 1.14735311577866e-06, "loss": 0.1345, "num_input_tokens_seen": 38956640, "step": 38725 }, { "epoch": 18.26025459688826, "grad_norm": 1.480117917060852, "learning_rate": 1.1442747901989653e-06, "loss": 0.1631, "num_input_tokens_seen": 38961088, "step": 38730 }, { "epoch": 18.262611975483264, "grad_norm": 0.09742552787065506, "learning_rate": 1.1412005029885015e-06, "loss": 0.095, "num_input_tokens_seen": 38965952, "step": 38735 }, { "epoch": 18.264969354078264, "grad_norm": 0.45506152510643005, "learning_rate": 1.1381302546676936e-06, "loss": 0.1468, "num_input_tokens_seen": 38970528, "step": 38740 }, { "epoch": 18.26732673267327, "grad_norm": 2.113184928894043, "learning_rate": 1.1350640457562844e-06, "loss": 0.2125, "num_input_tokens_seen": 38974784, "step": 38745 }, { "epoch": 18.26968411126827, "grad_norm": 0.3229585886001587, "learning_rate": 1.1320018767733254e-06, "loss": 0.0359, "num_input_tokens_seen": 38978912, "step": 38750 }, { "epoch": 18.272041489863273, "grad_norm": 0.16325847804546356, "learning_rate": 1.1289437482371989e-06, "loss": 0.0374, "num_input_tokens_seen": 38983808, "step": 38755 }, { "epoch": 18.274398868458274, "grad_norm": 0.14509399235248566, "learning_rate": 1.1258896606655827e-06, "loss": 0.1974, "num_input_tokens_seen": 38988544, "step": 38760 }, { "epoch": 18.276756247053278, "grad_norm": 0.09516989439725876, "learning_rate": 1.1228396145754882e-06, "loss": 0.1034, "num_input_tokens_seen": 38993696, "step": 38765 }, { "epoch": 18.27911362564828, "grad_norm": 3.256206512451172, "learning_rate": 1.1197936104832384e-06, "loss": 0.1283, "num_input_tokens_seen": 38998240, "step": 38770 }, { "epoch": 18.281471004243283, "grad_norm": 2.1163535118103027, "learning_rate": 1.1167516489044656e-06, "loss": 0.1337, "num_input_tokens_seen": 39004416, "step": 38775 }, { "epoch": 18.283828382838283, "grad_norm": 0.10188411176204681, "learning_rate": 1.113713730354124e-06, "loss": 0.0329, "num_input_tokens_seen": 39009440, "step": 38780 }, { "epoch": 18.286185761433288, "grad_norm": 0.33414193987846375, "learning_rate": 1.1106798553464804e-06, "loss": 0.1089, "num_input_tokens_seen": 39014016, "step": 38785 }, { "epoch": 18.28854314002829, "grad_norm": 0.2047710418701172, "learning_rate": 1.107650024395121e-06, "loss": 0.0405, "num_input_tokens_seen": 39018944, "step": 38790 }, { "epoch": 18.290900518623292, "grad_norm": 2.1444575786590576, "learning_rate": 1.1046242380129495e-06, "loss": 0.1871, "num_input_tokens_seen": 39024480, "step": 38795 }, { "epoch": 18.293257897218293, "grad_norm": 1.004206657409668, "learning_rate": 1.1016024967121674e-06, "loss": 0.3587, "num_input_tokens_seen": 39029536, "step": 38800 }, { "epoch": 18.295615275813297, "grad_norm": 0.2603381276130676, "learning_rate": 1.0985848010043126e-06, "loss": 0.0605, "num_input_tokens_seen": 39034400, "step": 38805 }, { "epoch": 18.297972654408298, "grad_norm": 0.1941567212343216, "learning_rate": 1.0955711514002265e-06, "loss": 0.2949, "num_input_tokens_seen": 39039616, "step": 38810 }, { "epoch": 18.300330033003302, "grad_norm": 1.1654853820800781, "learning_rate": 1.0925615484100704e-06, "loss": 0.1915, "num_input_tokens_seen": 39044384, "step": 38815 }, { "epoch": 18.302687411598303, "grad_norm": 0.04380359500646591, "learning_rate": 1.0895559925433201e-06, "loss": 0.0663, "num_input_tokens_seen": 39049824, "step": 38820 }, { "epoch": 18.305044790193303, "grad_norm": 0.18247754871845245, "learning_rate": 1.0865544843087634e-06, "loss": 0.0252, "num_input_tokens_seen": 39054240, "step": 38825 }, { "epoch": 18.307402168788308, "grad_norm": 1.386003017425537, "learning_rate": 1.0835570242145071e-06, "loss": 0.1493, "num_input_tokens_seen": 39059168, "step": 38830 }, { "epoch": 18.309759547383308, "grad_norm": 0.07934415340423584, "learning_rate": 1.080563612767971e-06, "loss": 0.0618, "num_input_tokens_seen": 39063296, "step": 38835 }, { "epoch": 18.312116925978312, "grad_norm": 0.17845679819583893, "learning_rate": 1.0775742504758795e-06, "loss": 0.0932, "num_input_tokens_seen": 39067968, "step": 38840 }, { "epoch": 18.314474304573313, "grad_norm": 0.01848330907523632, "learning_rate": 1.0745889378442924e-06, "loss": 0.0968, "num_input_tokens_seen": 39072480, "step": 38845 }, { "epoch": 18.316831683168317, "grad_norm": 0.9651110768318176, "learning_rate": 1.0716076753785664e-06, "loss": 0.0499, "num_input_tokens_seen": 39077696, "step": 38850 }, { "epoch": 18.319189061763318, "grad_norm": 0.40376007556915283, "learning_rate": 1.0686304635833782e-06, "loss": 0.0247, "num_input_tokens_seen": 39082048, "step": 38855 }, { "epoch": 18.321546440358322, "grad_norm": 0.43587353825569153, "learning_rate": 1.0656573029627221e-06, "loss": 0.1426, "num_input_tokens_seen": 39087392, "step": 38860 }, { "epoch": 18.323903818953323, "grad_norm": 1.0535178184509277, "learning_rate": 1.0626881940199035e-06, "loss": 0.093, "num_input_tokens_seen": 39091552, "step": 38865 }, { "epoch": 18.326261197548327, "grad_norm": 1.2010245323181152, "learning_rate": 1.0597231372575401e-06, "loss": 0.12, "num_input_tokens_seen": 39096448, "step": 38870 }, { "epoch": 18.328618576143327, "grad_norm": 0.04291713982820511, "learning_rate": 1.0567621331775717e-06, "loss": 0.1747, "num_input_tokens_seen": 39100480, "step": 38875 }, { "epoch": 18.33097595473833, "grad_norm": 1.3986023664474487, "learning_rate": 1.0538051822812367e-06, "loss": 0.1812, "num_input_tokens_seen": 39105664, "step": 38880 }, { "epoch": 18.333333333333332, "grad_norm": 1.0749387741088867, "learning_rate": 1.0508522850691012e-06, "loss": 0.0949, "num_input_tokens_seen": 39111104, "step": 38885 }, { "epoch": 18.335690711928336, "grad_norm": 0.8947421908378601, "learning_rate": 1.0479034420410427e-06, "loss": 0.0876, "num_input_tokens_seen": 39115680, "step": 38890 }, { "epoch": 18.338048090523337, "grad_norm": 1.2112882137298584, "learning_rate": 1.0449586536962458e-06, "loss": 0.1097, "num_input_tokens_seen": 39120992, "step": 38895 }, { "epoch": 18.34040546911834, "grad_norm": 2.0978057384490967, "learning_rate": 1.0420179205332142e-06, "loss": 0.1196, "num_input_tokens_seen": 39127104, "step": 38900 }, { "epoch": 18.34276284771334, "grad_norm": 0.9974848031997681, "learning_rate": 1.0390812430497665e-06, "loss": 0.0424, "num_input_tokens_seen": 39132256, "step": 38905 }, { "epoch": 18.345120226308346, "grad_norm": 0.10627693682909012, "learning_rate": 1.0361486217430328e-06, "loss": 0.0496, "num_input_tokens_seen": 39137504, "step": 38910 }, { "epoch": 18.347477604903347, "grad_norm": 1.1863071918487549, "learning_rate": 1.0332200571094518e-06, "loss": 0.1184, "num_input_tokens_seen": 39142368, "step": 38915 }, { "epoch": 18.34983498349835, "grad_norm": 0.06922031193971634, "learning_rate": 1.030295549644783e-06, "loss": 0.1596, "num_input_tokens_seen": 39147072, "step": 38920 }, { "epoch": 18.35219236209335, "grad_norm": 3.0491702556610107, "learning_rate": 1.0273750998440996e-06, "loss": 0.2228, "num_input_tokens_seen": 39151680, "step": 38925 }, { "epoch": 18.354549740688356, "grad_norm": 0.6811000108718872, "learning_rate": 1.0244587082017753e-06, "loss": 0.0987, "num_input_tokens_seen": 39156032, "step": 38930 }, { "epoch": 18.356907119283356, "grad_norm": 2.551279306411743, "learning_rate": 1.0215463752115101e-06, "loss": 0.1703, "num_input_tokens_seen": 39162912, "step": 38935 }, { "epoch": 18.35926449787836, "grad_norm": 0.04033161699771881, "learning_rate": 1.018638101366312e-06, "loss": 0.1928, "num_input_tokens_seen": 39168512, "step": 38940 }, { "epoch": 18.36162187647336, "grad_norm": 0.070872463285923, "learning_rate": 1.0157338871585043e-06, "loss": 0.0606, "num_input_tokens_seen": 39173696, "step": 38945 }, { "epoch": 18.363979255068365, "grad_norm": 0.40173807740211487, "learning_rate": 1.0128337330797182e-06, "loss": 0.0343, "num_input_tokens_seen": 39177664, "step": 38950 }, { "epoch": 18.366336633663366, "grad_norm": 0.01694096066057682, "learning_rate": 1.0099376396209032e-06, "loss": 0.1743, "num_input_tokens_seen": 39182400, "step": 38955 }, { "epoch": 18.36869401225837, "grad_norm": 1.046403169631958, "learning_rate": 1.0070456072723194e-06, "loss": 0.122, "num_input_tokens_seen": 39186976, "step": 38960 }, { "epoch": 18.37105139085337, "grad_norm": 0.10117820650339127, "learning_rate": 1.0041576365235395e-06, "loss": 0.0136, "num_input_tokens_seen": 39190976, "step": 38965 }, { "epoch": 18.373408769448375, "grad_norm": 0.32803046703338623, "learning_rate": 1.0012737278634416e-06, "loss": 0.0671, "num_input_tokens_seen": 39197504, "step": 38970 }, { "epoch": 18.375766148043375, "grad_norm": 0.42066630721092224, "learning_rate": 9.983938817802268e-07, "loss": 0.0333, "num_input_tokens_seen": 39203104, "step": 38975 }, { "epoch": 18.37812352663838, "grad_norm": 0.3217920958995819, "learning_rate": 9.955180987614026e-07, "loss": 0.0674, "num_input_tokens_seen": 39209472, "step": 38980 }, { "epoch": 18.38048090523338, "grad_norm": 0.20334911346435547, "learning_rate": 9.926463792937907e-07, "loss": 0.0823, "num_input_tokens_seen": 39214464, "step": 38985 }, { "epoch": 18.382838283828384, "grad_norm": 0.05607367306947708, "learning_rate": 9.89778723863527e-07, "loss": 0.0696, "num_input_tokens_seen": 39219200, "step": 38990 }, { "epoch": 18.385195662423385, "grad_norm": 2.439225673675537, "learning_rate": 9.869151329560565e-07, "loss": 0.168, "num_input_tokens_seen": 39222912, "step": 38995 }, { "epoch": 18.38755304101839, "grad_norm": 2.051910161972046, "learning_rate": 9.840556070561335e-07, "loss": 0.0822, "num_input_tokens_seen": 39228064, "step": 39000 }, { "epoch": 18.38991041961339, "grad_norm": 0.08606290817260742, "learning_rate": 9.812001466478315e-07, "loss": 0.1048, "num_input_tokens_seen": 39233792, "step": 39005 }, { "epoch": 18.392267798208394, "grad_norm": 0.1366279125213623, "learning_rate": 9.783487522145251e-07, "loss": 0.0651, "num_input_tokens_seen": 39237568, "step": 39010 }, { "epoch": 18.394625176803395, "grad_norm": 1.239233136177063, "learning_rate": 9.755014242389115e-07, "loss": 0.2261, "num_input_tokens_seen": 39241952, "step": 39015 }, { "epoch": 18.396982555398395, "grad_norm": 0.656815767288208, "learning_rate": 9.726581632029912e-07, "loss": 0.2001, "num_input_tokens_seen": 39246240, "step": 39020 }, { "epoch": 18.3993399339934, "grad_norm": 0.1254488229751587, "learning_rate": 9.69818969588085e-07, "loss": 0.0363, "num_input_tokens_seen": 39250848, "step": 39025 }, { "epoch": 18.4016973125884, "grad_norm": 0.06123283877968788, "learning_rate": 9.669838438748192e-07, "loss": 0.184, "num_input_tokens_seen": 39256832, "step": 39030 }, { "epoch": 18.404054691183404, "grad_norm": 0.5177552700042725, "learning_rate": 9.641527865431294e-07, "loss": 0.0449, "num_input_tokens_seen": 39262304, "step": 39035 }, { "epoch": 18.406412069778405, "grad_norm": 0.07223974168300629, "learning_rate": 9.613257980722684e-07, "loss": 0.1001, "num_input_tokens_seen": 39267552, "step": 39040 }, { "epoch": 18.40876944837341, "grad_norm": 0.03617564216256142, "learning_rate": 9.585028789408006e-07, "loss": 0.1903, "num_input_tokens_seen": 39272832, "step": 39045 }, { "epoch": 18.41112682696841, "grad_norm": 0.5531351566314697, "learning_rate": 9.556840296265884e-07, "loss": 0.1533, "num_input_tokens_seen": 39277632, "step": 39050 }, { "epoch": 18.413484205563414, "grad_norm": 2.017392635345459, "learning_rate": 9.528692506068221e-07, "loss": 0.2269, "num_input_tokens_seen": 39283488, "step": 39055 }, { "epoch": 18.415841584158414, "grad_norm": 0.09836051613092422, "learning_rate": 9.500585423579955e-07, "loss": 0.1539, "num_input_tokens_seen": 39288288, "step": 39060 }, { "epoch": 18.41819896275342, "grad_norm": 1.2187832593917847, "learning_rate": 9.472519053559114e-07, "loss": 0.0909, "num_input_tokens_seen": 39294432, "step": 39065 }, { "epoch": 18.42055634134842, "grad_norm": 0.03663168102502823, "learning_rate": 9.444493400756871e-07, "loss": 0.0454, "num_input_tokens_seen": 39299840, "step": 39070 }, { "epoch": 18.422913719943423, "grad_norm": 0.22602060437202454, "learning_rate": 9.416508469917512e-07, "loss": 0.0338, "num_input_tokens_seen": 39304480, "step": 39075 }, { "epoch": 18.425271098538424, "grad_norm": 0.0706673264503479, "learning_rate": 9.38856426577836e-07, "loss": 0.0932, "num_input_tokens_seen": 39308512, "step": 39080 }, { "epoch": 18.427628477133428, "grad_norm": 0.14847320318222046, "learning_rate": 9.360660793069992e-07, "loss": 0.111, "num_input_tokens_seen": 39313088, "step": 39085 }, { "epoch": 18.42998585572843, "grad_norm": 1.4233754873275757, "learning_rate": 9.332798056515879e-07, "loss": 0.1805, "num_input_tokens_seen": 39317888, "step": 39090 }, { "epoch": 18.432343234323433, "grad_norm": 1.2984355688095093, "learning_rate": 9.304976060832777e-07, "loss": 0.4297, "num_input_tokens_seen": 39322816, "step": 39095 }, { "epoch": 18.434700612918434, "grad_norm": 0.18303917348384857, "learning_rate": 9.277194810730444e-07, "loss": 0.0357, "num_input_tokens_seen": 39328032, "step": 39100 }, { "epoch": 18.437057991513438, "grad_norm": 0.45170700550079346, "learning_rate": 9.249454310911787e-07, "loss": 0.0993, "num_input_tokens_seen": 39332544, "step": 39105 }, { "epoch": 18.43941537010844, "grad_norm": 0.06497466564178467, "learning_rate": 9.221754566072827e-07, "loss": 0.0679, "num_input_tokens_seen": 39337600, "step": 39110 }, { "epoch": 18.441772748703443, "grad_norm": 3.0966904163360596, "learning_rate": 9.194095580902645e-07, "loss": 0.0492, "num_input_tokens_seen": 39342240, "step": 39115 }, { "epoch": 18.444130127298443, "grad_norm": 1.2189955711364746, "learning_rate": 9.166477360083415e-07, "loss": 0.1008, "num_input_tokens_seen": 39347872, "step": 39120 }, { "epoch": 18.446487505893447, "grad_norm": 0.18584486842155457, "learning_rate": 9.138899908290421e-07, "loss": 0.0326, "num_input_tokens_seen": 39352864, "step": 39125 }, { "epoch": 18.448844884488448, "grad_norm": 0.4621744751930237, "learning_rate": 9.1113632301921e-07, "loss": 0.1891, "num_input_tokens_seen": 39357152, "step": 39130 }, { "epoch": 18.451202263083452, "grad_norm": 0.09931658208370209, "learning_rate": 9.083867330449969e-07, "loss": 0.0817, "num_input_tokens_seen": 39362560, "step": 39135 }, { "epoch": 18.453559641678453, "grad_norm": 0.7088258862495422, "learning_rate": 9.05641221371853e-07, "loss": 0.2691, "num_input_tokens_seen": 39367296, "step": 39140 }, { "epoch": 18.455917020273457, "grad_norm": 0.15448535978794098, "learning_rate": 9.028997884645535e-07, "loss": 0.113, "num_input_tokens_seen": 39372576, "step": 39145 }, { "epoch": 18.458274398868458, "grad_norm": 1.154727816581726, "learning_rate": 9.001624347871717e-07, "loss": 0.1895, "num_input_tokens_seen": 39378464, "step": 39150 }, { "epoch": 18.460631777463462, "grad_norm": 1.4706313610076904, "learning_rate": 8.974291608030983e-07, "loss": 0.1012, "num_input_tokens_seen": 39383392, "step": 39155 }, { "epoch": 18.462989156058462, "grad_norm": 0.2669197618961334, "learning_rate": 8.946999669750294e-07, "loss": 0.1174, "num_input_tokens_seen": 39387776, "step": 39160 }, { "epoch": 18.465346534653467, "grad_norm": 1.847612738609314, "learning_rate": 8.919748537649736e-07, "loss": 0.1123, "num_input_tokens_seen": 39392480, "step": 39165 }, { "epoch": 18.467703913248467, "grad_norm": 0.03660929575562477, "learning_rate": 8.892538216342422e-07, "loss": 0.1172, "num_input_tokens_seen": 39397344, "step": 39170 }, { "epoch": 18.47006129184347, "grad_norm": 0.4908805191516876, "learning_rate": 8.86536871043464e-07, "loss": 0.082, "num_input_tokens_seen": 39401824, "step": 39175 }, { "epoch": 18.472418670438472, "grad_norm": 0.30921053886413574, "learning_rate": 8.838240024525685e-07, "loss": 0.0195, "num_input_tokens_seen": 39406560, "step": 39180 }, { "epoch": 18.474776049033476, "grad_norm": 0.062423139810562134, "learning_rate": 8.811152163208019e-07, "loss": 0.0569, "num_input_tokens_seen": 39410912, "step": 39185 }, { "epoch": 18.477133427628477, "grad_norm": 1.7669004201889038, "learning_rate": 8.784105131067116e-07, "loss": 0.1619, "num_input_tokens_seen": 39416096, "step": 39190 }, { "epoch": 18.47949080622348, "grad_norm": 2.893559694290161, "learning_rate": 8.75709893268159e-07, "loss": 0.1104, "num_input_tokens_seen": 39421088, "step": 39195 }, { "epoch": 18.48184818481848, "grad_norm": 1.2087485790252686, "learning_rate": 8.730133572623172e-07, "loss": 0.164, "num_input_tokens_seen": 39426176, "step": 39200 }, { "epoch": 18.484205563413486, "grad_norm": 0.05494721978902817, "learning_rate": 8.703209055456573e-07, "loss": 0.0605, "num_input_tokens_seen": 39430848, "step": 39205 }, { "epoch": 18.486562942008486, "grad_norm": 0.573886513710022, "learning_rate": 8.676325385739731e-07, "loss": 0.1198, "num_input_tokens_seen": 39435264, "step": 39210 }, { "epoch": 18.48892032060349, "grad_norm": 0.8360560536384583, "learning_rate": 8.649482568023559e-07, "loss": 0.0414, "num_input_tokens_seen": 39439904, "step": 39215 }, { "epoch": 18.49127769919849, "grad_norm": 0.21555465459823608, "learning_rate": 8.622680606852091e-07, "loss": 0.0941, "num_input_tokens_seen": 39445984, "step": 39220 }, { "epoch": 18.493635077793492, "grad_norm": 0.9161751866340637, "learning_rate": 8.595919506762418e-07, "loss": 0.1217, "num_input_tokens_seen": 39451360, "step": 39225 }, { "epoch": 18.495992456388496, "grad_norm": 0.21439407765865326, "learning_rate": 8.569199272284778e-07, "loss": 0.0621, "num_input_tokens_seen": 39456256, "step": 39230 }, { "epoch": 18.498349834983497, "grad_norm": 0.0905635803937912, "learning_rate": 8.542519907942442e-07, "loss": 0.0504, "num_input_tokens_seen": 39461472, "step": 39235 }, { "epoch": 18.5007072135785, "grad_norm": 0.09298818558454514, "learning_rate": 8.51588141825177e-07, "loss": 0.1038, "num_input_tokens_seen": 39466784, "step": 39240 }, { "epoch": 18.5030645921735, "grad_norm": 2.5251519680023193, "learning_rate": 8.489283807722182e-07, "loss": 0.078, "num_input_tokens_seen": 39474560, "step": 39245 }, { "epoch": 18.505421970768506, "grad_norm": 1.3580708503723145, "learning_rate": 8.462727080856242e-07, "loss": 0.049, "num_input_tokens_seen": 39480032, "step": 39250 }, { "epoch": 18.507779349363506, "grad_norm": 0.06539181619882584, "learning_rate": 8.436211242149577e-07, "loss": 0.1415, "num_input_tokens_seen": 39485088, "step": 39255 }, { "epoch": 18.51013672795851, "grad_norm": 0.31698668003082275, "learning_rate": 8.409736296090792e-07, "loss": 0.0926, "num_input_tokens_seen": 39490784, "step": 39260 }, { "epoch": 18.51249410655351, "grad_norm": 1.0761951208114624, "learning_rate": 8.383302247161689e-07, "loss": 0.1875, "num_input_tokens_seen": 39495488, "step": 39265 }, { "epoch": 18.514851485148515, "grad_norm": 1.0812993049621582, "learning_rate": 8.356909099837107e-07, "loss": 0.2041, "num_input_tokens_seen": 39500512, "step": 39270 }, { "epoch": 18.517208863743516, "grad_norm": 2.005314588546753, "learning_rate": 8.330556858584915e-07, "loss": 0.0673, "num_input_tokens_seen": 39503840, "step": 39275 }, { "epoch": 18.51956624233852, "grad_norm": 0.048918917775154114, "learning_rate": 8.304245527866156e-07, "loss": 0.0269, "num_input_tokens_seen": 39508608, "step": 39280 }, { "epoch": 18.52192362093352, "grad_norm": 1.3736355304718018, "learning_rate": 8.277975112134878e-07, "loss": 0.3023, "num_input_tokens_seen": 39512928, "step": 39285 }, { "epoch": 18.524280999528525, "grad_norm": 1.402951955795288, "learning_rate": 8.251745615838192e-07, "loss": 0.1166, "num_input_tokens_seen": 39517344, "step": 39290 }, { "epoch": 18.526638378123526, "grad_norm": 1.2441401481628418, "learning_rate": 8.225557043416349e-07, "loss": 0.2468, "num_input_tokens_seen": 39522656, "step": 39295 }, { "epoch": 18.52899575671853, "grad_norm": 0.23925912380218506, "learning_rate": 8.199409399302582e-07, "loss": 0.1155, "num_input_tokens_seen": 39527488, "step": 39300 }, { "epoch": 18.53135313531353, "grad_norm": 2.7826013565063477, "learning_rate": 8.17330268792324e-07, "loss": 0.1965, "num_input_tokens_seen": 39531456, "step": 39305 }, { "epoch": 18.533710513908535, "grad_norm": 0.09705385565757751, "learning_rate": 8.147236913697787e-07, "loss": 0.043, "num_input_tokens_seen": 39535968, "step": 39310 }, { "epoch": 18.536067892503535, "grad_norm": 1.7144103050231934, "learning_rate": 8.121212081038721e-07, "loss": 0.0509, "num_input_tokens_seen": 39541376, "step": 39315 }, { "epoch": 18.53842527109854, "grad_norm": 0.32884666323661804, "learning_rate": 8.095228194351545e-07, "loss": 0.1655, "num_input_tokens_seen": 39546368, "step": 39320 }, { "epoch": 18.54078264969354, "grad_norm": 0.07001490145921707, "learning_rate": 8.069285258034936e-07, "loss": 0.0174, "num_input_tokens_seen": 39551200, "step": 39325 }, { "epoch": 18.543140028288544, "grad_norm": 0.9297005534172058, "learning_rate": 8.043383276480603e-07, "loss": 0.1291, "num_input_tokens_seen": 39556736, "step": 39330 }, { "epoch": 18.545497406883545, "grad_norm": 0.12432265281677246, "learning_rate": 8.017522254073289e-07, "loss": 0.1175, "num_input_tokens_seen": 39561856, "step": 39335 }, { "epoch": 18.54785478547855, "grad_norm": 0.9840418696403503, "learning_rate": 7.991702195190854e-07, "loss": 0.1836, "num_input_tokens_seen": 39566816, "step": 39340 }, { "epoch": 18.55021216407355, "grad_norm": 2.227818489074707, "learning_rate": 7.965923104204131e-07, "loss": 0.1644, "num_input_tokens_seen": 39571808, "step": 39345 }, { "epoch": 18.552569542668554, "grad_norm": 1.6195579767227173, "learning_rate": 7.940184985477133e-07, "loss": 0.0728, "num_input_tokens_seen": 39576544, "step": 39350 }, { "epoch": 18.554926921263554, "grad_norm": 0.04848271980881691, "learning_rate": 7.914487843366874e-07, "loss": 0.0288, "num_input_tokens_seen": 39580864, "step": 39355 }, { "epoch": 18.55728429985856, "grad_norm": 0.17110571265220642, "learning_rate": 7.88883168222343e-07, "loss": 0.0355, "num_input_tokens_seen": 39585824, "step": 39360 }, { "epoch": 18.55964167845356, "grad_norm": 1.6985913515090942, "learning_rate": 7.863216506389964e-07, "loss": 0.1453, "num_input_tokens_seen": 39591072, "step": 39365 }, { "epoch": 18.561999057048563, "grad_norm": 0.12528109550476074, "learning_rate": 7.837642320202732e-07, "loss": 0.1876, "num_input_tokens_seen": 39595616, "step": 39370 }, { "epoch": 18.564356435643564, "grad_norm": 1.854185938835144, "learning_rate": 7.812109127990935e-07, "loss": 0.188, "num_input_tokens_seen": 39600608, "step": 39375 }, { "epoch": 18.566713814238568, "grad_norm": 1.7681982517242432, "learning_rate": 7.786616934076979e-07, "loss": 0.1367, "num_input_tokens_seen": 39604832, "step": 39380 }, { "epoch": 18.56907119283357, "grad_norm": 1.4749294519424438, "learning_rate": 7.761165742776217e-07, "loss": 0.2744, "num_input_tokens_seen": 39610144, "step": 39385 }, { "epoch": 18.571428571428573, "grad_norm": 0.16082674264907837, "learning_rate": 7.73575555839709e-07, "loss": 0.1263, "num_input_tokens_seen": 39615168, "step": 39390 }, { "epoch": 18.573785950023574, "grad_norm": 1.4752224683761597, "learning_rate": 7.710386385241158e-07, "loss": 0.1583, "num_input_tokens_seen": 39619392, "step": 39395 }, { "epoch": 18.576143328618578, "grad_norm": 0.8666867613792419, "learning_rate": 7.685058227602959e-07, "loss": 0.0661, "num_input_tokens_seen": 39623392, "step": 39400 }, { "epoch": 18.57850070721358, "grad_norm": 0.20661543309688568, "learning_rate": 7.659771089770118e-07, "loss": 0.1325, "num_input_tokens_seen": 39627456, "step": 39405 }, { "epoch": 18.580858085808583, "grad_norm": 0.8937453031539917, "learning_rate": 7.634524976023349e-07, "loss": 0.1456, "num_input_tokens_seen": 39632096, "step": 39410 }, { "epoch": 18.583215464403583, "grad_norm": 0.15325899422168732, "learning_rate": 7.609319890636346e-07, "loss": 0.1843, "num_input_tokens_seen": 39637632, "step": 39415 }, { "epoch": 18.585572842998587, "grad_norm": 0.058458853513002396, "learning_rate": 7.584155837875973e-07, "loss": 0.1558, "num_input_tokens_seen": 39642432, "step": 39420 }, { "epoch": 18.587930221593588, "grad_norm": 0.031460586935281754, "learning_rate": 7.559032822001988e-07, "loss": 0.0229, "num_input_tokens_seen": 39647808, "step": 39425 }, { "epoch": 18.59028760018859, "grad_norm": 1.274845004081726, "learning_rate": 7.533950847267351e-07, "loss": 0.1226, "num_input_tokens_seen": 39652864, "step": 39430 }, { "epoch": 18.592644978783593, "grad_norm": 0.4723230302333832, "learning_rate": 7.508909917917972e-07, "loss": 0.0955, "num_input_tokens_seen": 39658560, "step": 39435 }, { "epoch": 18.595002357378593, "grad_norm": 0.22020046412944794, "learning_rate": 7.483910038192904e-07, "loss": 0.0357, "num_input_tokens_seen": 39663168, "step": 39440 }, { "epoch": 18.597359735973598, "grad_norm": 0.43384140729904175, "learning_rate": 7.458951212324178e-07, "loss": 0.1306, "num_input_tokens_seen": 39667712, "step": 39445 }, { "epoch": 18.599717114568598, "grad_norm": 0.04892818257212639, "learning_rate": 7.434033444536886e-07, "loss": 0.1479, "num_input_tokens_seen": 39672256, "step": 39450 }, { "epoch": 18.602074493163602, "grad_norm": 1.7465554475784302, "learning_rate": 7.409156739049183e-07, "loss": 0.0728, "num_input_tokens_seen": 39676608, "step": 39455 }, { "epoch": 18.604431871758603, "grad_norm": 0.11365614086389542, "learning_rate": 7.384321100072339e-07, "loss": 0.0778, "num_input_tokens_seen": 39680992, "step": 39460 }, { "epoch": 18.606789250353607, "grad_norm": 0.42528581619262695, "learning_rate": 7.359526531810518e-07, "loss": 0.1383, "num_input_tokens_seen": 39686048, "step": 39465 }, { "epoch": 18.609146628948608, "grad_norm": 0.012931374832987785, "learning_rate": 7.334773038461056e-07, "loss": 0.2182, "num_input_tokens_seen": 39691584, "step": 39470 }, { "epoch": 18.611504007543612, "grad_norm": 0.023907791823148727, "learning_rate": 7.310060624214271e-07, "loss": 0.0293, "num_input_tokens_seen": 39697504, "step": 39475 }, { "epoch": 18.613861386138613, "grad_norm": 0.03788882866501808, "learning_rate": 7.285389293253619e-07, "loss": 0.0858, "num_input_tokens_seen": 39702816, "step": 39480 }, { "epoch": 18.616218764733617, "grad_norm": 0.7950799465179443, "learning_rate": 7.260759049755484e-07, "loss": 0.0385, "num_input_tokens_seen": 39709792, "step": 39485 }, { "epoch": 18.618576143328617, "grad_norm": 0.48175716400146484, "learning_rate": 7.236169897889361e-07, "loss": 0.141, "num_input_tokens_seen": 39714368, "step": 39490 }, { "epoch": 18.62093352192362, "grad_norm": 0.061369724571704865, "learning_rate": 7.211621841817784e-07, "loss": 0.0161, "num_input_tokens_seen": 39719392, "step": 39495 }, { "epoch": 18.623290900518622, "grad_norm": 1.54899001121521, "learning_rate": 7.187114885696345e-07, "loss": 0.1927, "num_input_tokens_seen": 39723904, "step": 39500 }, { "epoch": 18.625648279113626, "grad_norm": 1.3954317569732666, "learning_rate": 7.162649033673585e-07, "loss": 0.0882, "num_input_tokens_seen": 39728000, "step": 39505 }, { "epoch": 18.628005657708627, "grad_norm": 0.2790425419807434, "learning_rate": 7.138224289891221e-07, "loss": 0.0375, "num_input_tokens_seen": 39732608, "step": 39510 }, { "epoch": 18.63036303630363, "grad_norm": 0.17016150057315826, "learning_rate": 7.113840658483917e-07, "loss": 0.0125, "num_input_tokens_seen": 39738144, "step": 39515 }, { "epoch": 18.632720414898632, "grad_norm": 1.6473524570465088, "learning_rate": 7.089498143579426e-07, "loss": 0.0554, "num_input_tokens_seen": 39743456, "step": 39520 }, { "epoch": 18.635077793493636, "grad_norm": 1.919450044631958, "learning_rate": 7.065196749298508e-07, "loss": 0.2226, "num_input_tokens_seen": 39749120, "step": 39525 }, { "epoch": 18.637435172088637, "grad_norm": 0.6891819834709167, "learning_rate": 7.040936479754984e-07, "loss": 0.1352, "num_input_tokens_seen": 39753504, "step": 39530 }, { "epoch": 18.63979255068364, "grad_norm": 0.2295583188533783, "learning_rate": 7.016717339055706e-07, "loss": 0.02, "num_input_tokens_seen": 39757984, "step": 39535 }, { "epoch": 18.64214992927864, "grad_norm": 0.2559640407562256, "learning_rate": 6.99253933130059e-07, "loss": 0.0798, "num_input_tokens_seen": 39763136, "step": 39540 }, { "epoch": 18.644507307873646, "grad_norm": 0.13584381341934204, "learning_rate": 6.968402460582502e-07, "loss": 0.0883, "num_input_tokens_seen": 39768416, "step": 39545 }, { "epoch": 18.646864686468646, "grad_norm": 1.2318958044052124, "learning_rate": 6.944306730987421e-07, "loss": 0.0637, "num_input_tokens_seen": 39774208, "step": 39550 }, { "epoch": 18.64922206506365, "grad_norm": 0.16561421751976013, "learning_rate": 6.920252146594364e-07, "loss": 0.016, "num_input_tokens_seen": 39778080, "step": 39555 }, { "epoch": 18.65157944365865, "grad_norm": 0.9545009136199951, "learning_rate": 6.896238711475378e-07, "loss": 0.1284, "num_input_tokens_seen": 39784608, "step": 39560 }, { "epoch": 18.653936822253655, "grad_norm": 1.4868338108062744, "learning_rate": 6.872266429695518e-07, "loss": 0.1575, "num_input_tokens_seen": 39788736, "step": 39565 }, { "epoch": 18.656294200848656, "grad_norm": 0.04543665051460266, "learning_rate": 6.84833530531287e-07, "loss": 0.0156, "num_input_tokens_seen": 39793600, "step": 39570 }, { "epoch": 18.65865157944366, "grad_norm": 0.8279416561126709, "learning_rate": 6.824445342378583e-07, "loss": 0.1292, "num_input_tokens_seen": 39799104, "step": 39575 }, { "epoch": 18.66100895803866, "grad_norm": 1.5484288930892944, "learning_rate": 6.800596544936783e-07, "loss": 0.1113, "num_input_tokens_seen": 39803872, "step": 39580 }, { "epoch": 18.663366336633665, "grad_norm": 0.028004251420497894, "learning_rate": 6.776788917024713e-07, "loss": 0.3191, "num_input_tokens_seen": 39809504, "step": 39585 }, { "epoch": 18.665723715228665, "grad_norm": 0.1105801984667778, "learning_rate": 6.753022462672592e-07, "loss": 0.0296, "num_input_tokens_seen": 39813664, "step": 39590 }, { "epoch": 18.66808109382367, "grad_norm": 2.890259027481079, "learning_rate": 6.729297185903677e-07, "loss": 0.0908, "num_input_tokens_seen": 39821312, "step": 39595 }, { "epoch": 18.67043847241867, "grad_norm": 0.22632794082164764, "learning_rate": 6.705613090734225e-07, "loss": 0.0323, "num_input_tokens_seen": 39825856, "step": 39600 }, { "epoch": 18.672795851013674, "grad_norm": 0.7847054600715637, "learning_rate": 6.681970181173585e-07, "loss": 0.0556, "num_input_tokens_seen": 39830912, "step": 39605 }, { "epoch": 18.675153229608675, "grad_norm": 0.09038280695676804, "learning_rate": 6.658368461224085e-07, "loss": 0.0655, "num_input_tokens_seen": 39835936, "step": 39610 }, { "epoch": 18.677510608203676, "grad_norm": 0.7416695356369019, "learning_rate": 6.634807934881082e-07, "loss": 0.1309, "num_input_tokens_seen": 39841440, "step": 39615 }, { "epoch": 18.67986798679868, "grad_norm": 0.6739674210548401, "learning_rate": 6.611288606132998e-07, "loss": 0.1338, "num_input_tokens_seen": 39845760, "step": 39620 }, { "epoch": 18.68222536539368, "grad_norm": 1.249297857284546, "learning_rate": 6.587810478961259e-07, "loss": 0.3115, "num_input_tokens_seen": 39850848, "step": 39625 }, { "epoch": 18.684582743988685, "grad_norm": 0.08146951347589493, "learning_rate": 6.564373557340326e-07, "loss": 0.0286, "num_input_tokens_seen": 39856384, "step": 39630 }, { "epoch": 18.686940122583685, "grad_norm": 2.936699151992798, "learning_rate": 6.540977845237606e-07, "loss": 0.1787, "num_input_tokens_seen": 39860864, "step": 39635 }, { "epoch": 18.68929750117869, "grad_norm": 0.033566974103450775, "learning_rate": 6.517623346613654e-07, "loss": 0.0826, "num_input_tokens_seen": 39866080, "step": 39640 }, { "epoch": 18.69165487977369, "grad_norm": 2.0919039249420166, "learning_rate": 6.494310065421977e-07, "loss": 0.245, "num_input_tokens_seen": 39871936, "step": 39645 }, { "epoch": 18.694012258368694, "grad_norm": 0.17042654752731323, "learning_rate": 6.471038005609109e-07, "loss": 0.064, "num_input_tokens_seen": 39876512, "step": 39650 }, { "epoch": 18.696369636963695, "grad_norm": 0.1561994105577469, "learning_rate": 6.447807171114622e-07, "loss": 0.0385, "num_input_tokens_seen": 39880992, "step": 39655 }, { "epoch": 18.6987270155587, "grad_norm": 0.7110235095024109, "learning_rate": 6.424617565871094e-07, "loss": 0.0616, "num_input_tokens_seen": 39886208, "step": 39660 }, { "epoch": 18.7010843941537, "grad_norm": 0.2165270447731018, "learning_rate": 6.401469193804133e-07, "loss": 0.1621, "num_input_tokens_seen": 39892192, "step": 39665 }, { "epoch": 18.703441772748704, "grad_norm": 0.3152214288711548, "learning_rate": 6.37836205883241e-07, "loss": 0.0417, "num_input_tokens_seen": 39897824, "step": 39670 }, { "epoch": 18.705799151343705, "grad_norm": 0.5618986487388611, "learning_rate": 6.355296164867491e-07, "loss": 0.2497, "num_input_tokens_seen": 39903456, "step": 39675 }, { "epoch": 18.70815652993871, "grad_norm": 2.7307777404785156, "learning_rate": 6.332271515814087e-07, "loss": 0.22, "num_input_tokens_seen": 39908160, "step": 39680 }, { "epoch": 18.71051390853371, "grad_norm": 0.8713597059249878, "learning_rate": 6.309288115569884e-07, "loss": 0.1645, "num_input_tokens_seen": 39913696, "step": 39685 }, { "epoch": 18.712871287128714, "grad_norm": 0.041486941277980804, "learning_rate": 6.286345968025548e-07, "loss": 0.1978, "num_input_tokens_seen": 39918432, "step": 39690 }, { "epoch": 18.715228665723714, "grad_norm": 0.06329599022865295, "learning_rate": 6.263445077064833e-07, "loss": 0.043, "num_input_tokens_seen": 39923872, "step": 39695 }, { "epoch": 18.71758604431872, "grad_norm": 0.041647858917713165, "learning_rate": 6.240585446564446e-07, "loss": 0.0253, "num_input_tokens_seen": 39928928, "step": 39700 }, { "epoch": 18.71994342291372, "grad_norm": 1.7085061073303223, "learning_rate": 6.217767080394149e-07, "loss": 0.0518, "num_input_tokens_seen": 39933536, "step": 39705 }, { "epoch": 18.722300801508723, "grad_norm": 1.2514797449111938, "learning_rate": 6.194989982416715e-07, "loss": 0.2999, "num_input_tokens_seen": 39938048, "step": 39710 }, { "epoch": 18.724658180103724, "grad_norm": 0.5761776566505432, "learning_rate": 6.172254156487894e-07, "loss": 0.0915, "num_input_tokens_seen": 39942752, "step": 39715 }, { "epoch": 18.727015558698728, "grad_norm": 0.8891804218292236, "learning_rate": 6.149559606456468e-07, "loss": 0.1914, "num_input_tokens_seen": 39946784, "step": 39720 }, { "epoch": 18.72937293729373, "grad_norm": 0.23882202804088593, "learning_rate": 6.126906336164279e-07, "loss": 0.0627, "num_input_tokens_seen": 39952384, "step": 39725 }, { "epoch": 18.731730315888733, "grad_norm": 0.31070923805236816, "learning_rate": 6.104294349446094e-07, "loss": 0.0264, "num_input_tokens_seen": 39956672, "step": 39730 }, { "epoch": 18.734087694483733, "grad_norm": 0.1879836469888687, "learning_rate": 6.081723650129767e-07, "loss": 0.1414, "num_input_tokens_seen": 39962368, "step": 39735 }, { "epoch": 18.736445073078738, "grad_norm": 0.6954750418663025, "learning_rate": 6.059194242036131e-07, "loss": 0.2651, "num_input_tokens_seen": 39969728, "step": 39740 }, { "epoch": 18.738802451673738, "grad_norm": 0.03321649506688118, "learning_rate": 6.036706128979053e-07, "loss": 0.1204, "num_input_tokens_seen": 39974656, "step": 39745 }, { "epoch": 18.741159830268742, "grad_norm": 0.08580910414457321, "learning_rate": 6.014259314765375e-07, "loss": 0.2363, "num_input_tokens_seen": 39978976, "step": 39750 }, { "epoch": 18.743517208863743, "grad_norm": 0.21892882883548737, "learning_rate": 5.991853803194919e-07, "loss": 0.1488, "num_input_tokens_seen": 39984000, "step": 39755 }, { "epoch": 18.745874587458747, "grad_norm": 0.3578975796699524, "learning_rate": 5.969489598060596e-07, "loss": 0.0711, "num_input_tokens_seen": 39988512, "step": 39760 }, { "epoch": 18.748231966053748, "grad_norm": 1.3909409046173096, "learning_rate": 5.947166703148293e-07, "loss": 0.2039, "num_input_tokens_seen": 39992832, "step": 39765 }, { "epoch": 18.750589344648752, "grad_norm": 1.4068756103515625, "learning_rate": 5.924885122236851e-07, "loss": 0.0441, "num_input_tokens_seen": 39999072, "step": 39770 }, { "epoch": 18.752946723243753, "grad_norm": 0.0448385551571846, "learning_rate": 5.90264485909825e-07, "loss": 0.0406, "num_input_tokens_seen": 40004352, "step": 39775 }, { "epoch": 18.755304101838757, "grad_norm": 0.11061663925647736, "learning_rate": 5.880445917497284e-07, "loss": 0.1138, "num_input_tokens_seen": 40010432, "step": 39780 }, { "epoch": 18.757661480433757, "grad_norm": 0.11271563917398453, "learning_rate": 5.858288301191949e-07, "loss": 0.2268, "num_input_tokens_seen": 40015296, "step": 39785 }, { "epoch": 18.76001885902876, "grad_norm": 0.9144615530967712, "learning_rate": 5.836172013933133e-07, "loss": 0.0311, "num_input_tokens_seen": 40020960, "step": 39790 }, { "epoch": 18.762376237623762, "grad_norm": 0.6477385759353638, "learning_rate": 5.814097059464702e-07, "loss": 0.2128, "num_input_tokens_seen": 40026272, "step": 39795 }, { "epoch": 18.764733616218766, "grad_norm": 0.05633830279111862, "learning_rate": 5.792063441523609e-07, "loss": 0.0159, "num_input_tokens_seen": 40031232, "step": 39800 }, { "epoch": 18.767090994813767, "grad_norm": 0.2429458498954773, "learning_rate": 5.770071163839763e-07, "loss": 0.1002, "num_input_tokens_seen": 40036032, "step": 39805 }, { "epoch": 18.76944837340877, "grad_norm": 0.003739571664482355, "learning_rate": 5.748120230136045e-07, "loss": 0.0968, "num_input_tokens_seen": 40040832, "step": 39810 }, { "epoch": 18.77180575200377, "grad_norm": 0.12288078665733337, "learning_rate": 5.726210644128427e-07, "loss": 0.0528, "num_input_tokens_seen": 40045664, "step": 39815 }, { "epoch": 18.774163130598772, "grad_norm": 0.39622461795806885, "learning_rate": 5.704342409525831e-07, "loss": 0.088, "num_input_tokens_seen": 40053024, "step": 39820 }, { "epoch": 18.776520509193777, "grad_norm": 3.39848256111145, "learning_rate": 5.682515530030158e-07, "loss": 0.1828, "num_input_tokens_seen": 40056928, "step": 39825 }, { "epoch": 18.778877887788777, "grad_norm": 0.31051260232925415, "learning_rate": 5.660730009336312e-07, "loss": 0.053, "num_input_tokens_seen": 40061888, "step": 39830 }, { "epoch": 18.78123526638378, "grad_norm": 0.13346344232559204, "learning_rate": 5.638985851132234e-07, "loss": 0.0805, "num_input_tokens_seen": 40067552, "step": 39835 }, { "epoch": 18.783592644978782, "grad_norm": 0.7349955439567566, "learning_rate": 5.617283059098865e-07, "loss": 0.0252, "num_input_tokens_seen": 40073280, "step": 39840 }, { "epoch": 18.785950023573786, "grad_norm": 1.7803592681884766, "learning_rate": 5.595621636910075e-07, "loss": 0.2854, "num_input_tokens_seen": 40079584, "step": 39845 }, { "epoch": 18.788307402168787, "grad_norm": 0.06615635007619858, "learning_rate": 5.574001588232791e-07, "loss": 0.0205, "num_input_tokens_seen": 40083936, "step": 39850 }, { "epoch": 18.79066478076379, "grad_norm": 0.4767932891845703, "learning_rate": 5.552422916726918e-07, "loss": 0.106, "num_input_tokens_seen": 40087936, "step": 39855 }, { "epoch": 18.79302215935879, "grad_norm": 0.05090690404176712, "learning_rate": 5.53088562604534e-07, "loss": 0.0457, "num_input_tokens_seen": 40091296, "step": 39860 }, { "epoch": 18.795379537953796, "grad_norm": 0.03699429705739021, "learning_rate": 5.509389719834002e-07, "loss": 0.0172, "num_input_tokens_seen": 40096608, "step": 39865 }, { "epoch": 18.797736916548796, "grad_norm": 0.43295198678970337, "learning_rate": 5.48793520173177e-07, "loss": 0.1239, "num_input_tokens_seen": 40101824, "step": 39870 }, { "epoch": 18.8000942951438, "grad_norm": 0.33913955092430115, "learning_rate": 5.466522075370517e-07, "loss": 0.0354, "num_input_tokens_seen": 40106432, "step": 39875 }, { "epoch": 18.8024516737388, "grad_norm": 1.352807879447937, "learning_rate": 5.445150344375178e-07, "loss": 0.12, "num_input_tokens_seen": 40113408, "step": 39880 }, { "epoch": 18.804809052333805, "grad_norm": 0.8303845524787903, "learning_rate": 5.42382001236355e-07, "loss": 0.0595, "num_input_tokens_seen": 40117888, "step": 39885 }, { "epoch": 18.807166430928806, "grad_norm": 1.8018943071365356, "learning_rate": 5.402531082946499e-07, "loss": 0.1484, "num_input_tokens_seen": 40123360, "step": 39890 }, { "epoch": 18.80952380952381, "grad_norm": 0.014411495998501778, "learning_rate": 5.381283559727945e-07, "loss": 0.1145, "num_input_tokens_seen": 40128864, "step": 39895 }, { "epoch": 18.81188118811881, "grad_norm": 1.4983903169631958, "learning_rate": 5.360077446304679e-07, "loss": 0.1867, "num_input_tokens_seen": 40134688, "step": 39900 }, { "epoch": 18.814238566713815, "grad_norm": 0.015267894603312016, "learning_rate": 5.33891274626655e-07, "loss": 0.0893, "num_input_tokens_seen": 40139488, "step": 39905 }, { "epoch": 18.816595945308816, "grad_norm": 0.48032569885253906, "learning_rate": 5.317789463196388e-07, "loss": 0.0409, "num_input_tokens_seen": 40144320, "step": 39910 }, { "epoch": 18.81895332390382, "grad_norm": 0.03403768688440323, "learning_rate": 5.296707600669998e-07, "loss": 0.0211, "num_input_tokens_seen": 40149568, "step": 39915 }, { "epoch": 18.82131070249882, "grad_norm": 0.939904510974884, "learning_rate": 5.27566716225622e-07, "loss": 0.2048, "num_input_tokens_seen": 40153632, "step": 39920 }, { "epoch": 18.823668081093825, "grad_norm": 0.2850344479084015, "learning_rate": 5.254668151516762e-07, "loss": 0.075, "num_input_tokens_seen": 40159296, "step": 39925 }, { "epoch": 18.826025459688825, "grad_norm": 0.7862595915794373, "learning_rate": 5.233710572006473e-07, "loss": 0.1785, "num_input_tokens_seen": 40165088, "step": 39930 }, { "epoch": 18.82838283828383, "grad_norm": 0.4035104513168335, "learning_rate": 5.212794427273071e-07, "loss": 0.0268, "num_input_tokens_seen": 40170208, "step": 39935 }, { "epoch": 18.83074021687883, "grad_norm": 1.0270581245422363, "learning_rate": 5.191919720857335e-07, "loss": 0.137, "num_input_tokens_seen": 40174336, "step": 39940 }, { "epoch": 18.833097595473834, "grad_norm": 1.306897759437561, "learning_rate": 5.171086456292967e-07, "loss": 0.1437, "num_input_tokens_seen": 40179136, "step": 39945 }, { "epoch": 18.835454974068835, "grad_norm": 0.7152137160301208, "learning_rate": 5.150294637106728e-07, "loss": 0.0554, "num_input_tokens_seen": 40184352, "step": 39950 }, { "epoch": 18.83781235266384, "grad_norm": 0.13282951712608337, "learning_rate": 5.129544266818276e-07, "loss": 0.0812, "num_input_tokens_seen": 40189408, "step": 39955 }, { "epoch": 18.84016973125884, "grad_norm": 1.1171797513961792, "learning_rate": 5.108835348940355e-07, "loss": 0.1997, "num_input_tokens_seen": 40194400, "step": 39960 }, { "epoch": 18.842527109853844, "grad_norm": 0.17967405915260315, "learning_rate": 5.088167886978579e-07, "loss": 0.0416, "num_input_tokens_seen": 40199488, "step": 39965 }, { "epoch": 18.844884488448844, "grad_norm": 0.8027732372283936, "learning_rate": 5.067541884431593e-07, "loss": 0.1078, "num_input_tokens_seen": 40205120, "step": 39970 }, { "epoch": 18.84724186704385, "grad_norm": 0.8865215182304382, "learning_rate": 5.046957344791048e-07, "loss": 0.1288, "num_input_tokens_seen": 40211712, "step": 39975 }, { "epoch": 18.84959924563885, "grad_norm": 0.2934248149394989, "learning_rate": 5.026414271541574e-07, "loss": 0.0691, "num_input_tokens_seen": 40217152, "step": 39980 }, { "epoch": 18.851956624233853, "grad_norm": 1.7079782485961914, "learning_rate": 5.005912668160723e-07, "loss": 0.1671, "num_input_tokens_seen": 40221856, "step": 39985 }, { "epoch": 18.854314002828854, "grad_norm": 0.5498601198196411, "learning_rate": 4.985452538119134e-07, "loss": 0.1195, "num_input_tokens_seen": 40227232, "step": 39990 }, { "epoch": 18.85667138142386, "grad_norm": 0.0533800944685936, "learning_rate": 4.965033884880288e-07, "loss": 0.0588, "num_input_tokens_seen": 40231648, "step": 39995 }, { "epoch": 18.85902876001886, "grad_norm": 0.36933183670043945, "learning_rate": 4.94465671190078e-07, "loss": 0.0151, "num_input_tokens_seen": 40236128, "step": 40000 }, { "epoch": 18.861386138613863, "grad_norm": 1.8696484565734863, "learning_rate": 4.924321022630074e-07, "loss": 0.0895, "num_input_tokens_seen": 40241088, "step": 40005 }, { "epoch": 18.863743517208864, "grad_norm": 0.4627905488014221, "learning_rate": 4.904026820510665e-07, "loss": 0.0867, "num_input_tokens_seen": 40245888, "step": 40010 }, { "epoch": 18.866100895803868, "grad_norm": 0.07898390293121338, "learning_rate": 4.883774108978029e-07, "loss": 0.0626, "num_input_tokens_seen": 40250944, "step": 40015 }, { "epoch": 18.86845827439887, "grad_norm": 0.037283338606357574, "learning_rate": 4.863562891460588e-07, "loss": 0.156, "num_input_tokens_seen": 40255968, "step": 40020 }, { "epoch": 18.87081565299387, "grad_norm": 0.3762058913707733, "learning_rate": 4.843393171379773e-07, "loss": 0.0963, "num_input_tokens_seen": 40261216, "step": 40025 }, { "epoch": 18.873173031588873, "grad_norm": 0.1955961138010025, "learning_rate": 4.823264952149964e-07, "loss": 0.0298, "num_input_tokens_seen": 40267104, "step": 40030 }, { "epoch": 18.875530410183874, "grad_norm": 0.08134309947490692, "learning_rate": 4.803178237178546e-07, "loss": 0.2027, "num_input_tokens_seen": 40272576, "step": 40035 }, { "epoch": 18.877887788778878, "grad_norm": 0.2643662393093109, "learning_rate": 4.783133029865855e-07, "loss": 0.109, "num_input_tokens_seen": 40277088, "step": 40040 }, { "epoch": 18.88024516737388, "grad_norm": 1.3514195680618286, "learning_rate": 4.763129333605176e-07, "loss": 0.0756, "num_input_tokens_seen": 40282112, "step": 40045 }, { "epoch": 18.882602545968883, "grad_norm": 0.45144274830818176, "learning_rate": 4.743167151782857e-07, "loss": 0.1001, "num_input_tokens_seen": 40286528, "step": 40050 }, { "epoch": 18.884959924563884, "grad_norm": 0.20030535757541656, "learning_rate": 4.7232464877780824e-07, "loss": 0.1566, "num_input_tokens_seen": 40292384, "step": 40055 }, { "epoch": 18.887317303158888, "grad_norm": 1.0859798192977905, "learning_rate": 4.703367344963128e-07, "loss": 0.1333, "num_input_tokens_seen": 40298720, "step": 40060 }, { "epoch": 18.88967468175389, "grad_norm": 0.3457334041595459, "learning_rate": 4.683529726703162e-07, "loss": 0.0778, "num_input_tokens_seen": 40302592, "step": 40065 }, { "epoch": 18.892032060348892, "grad_norm": 0.30423012375831604, "learning_rate": 4.6637336363563876e-07, "loss": 0.1347, "num_input_tokens_seen": 40309536, "step": 40070 }, { "epoch": 18.894389438943893, "grad_norm": 1.0436469316482544, "learning_rate": 4.64397907727393e-07, "loss": 0.269, "num_input_tokens_seen": 40314560, "step": 40075 }, { "epoch": 18.896746817538897, "grad_norm": 0.11724196374416351, "learning_rate": 4.6242660527999194e-07, "loss": 0.1538, "num_input_tokens_seen": 40321152, "step": 40080 }, { "epoch": 18.899104196133898, "grad_norm": 0.21851427853107452, "learning_rate": 4.6045945662714083e-07, "loss": 0.1146, "num_input_tokens_seen": 40325920, "step": 40085 }, { "epoch": 18.901461574728902, "grad_norm": 1.226678729057312, "learning_rate": 4.5849646210184837e-07, "loss": 0.0472, "num_input_tokens_seen": 40331072, "step": 40090 }, { "epoch": 18.903818953323903, "grad_norm": 0.14543026685714722, "learning_rate": 4.565376220364126e-07, "loss": 0.0106, "num_input_tokens_seen": 40335680, "step": 40095 }, { "epoch": 18.906176331918907, "grad_norm": 1.4966939687728882, "learning_rate": 4.5458293676243214e-07, "loss": 0.1069, "num_input_tokens_seen": 40340384, "step": 40100 }, { "epoch": 18.908533710513908, "grad_norm": 0.41870802640914917, "learning_rate": 4.5263240661080344e-07, "loss": 0.0789, "num_input_tokens_seen": 40345888, "step": 40105 }, { "epoch": 18.91089108910891, "grad_norm": 1.2956507205963135, "learning_rate": 4.5068603191171786e-07, "loss": 0.0794, "num_input_tokens_seen": 40350144, "step": 40110 }, { "epoch": 18.913248467703912, "grad_norm": 0.0414821021258831, "learning_rate": 4.4874381299466475e-07, "loss": 0.1953, "num_input_tokens_seen": 40354912, "step": 40115 }, { "epoch": 18.915605846298917, "grad_norm": 0.9290849566459656, "learning_rate": 4.468057501884254e-07, "loss": 0.0445, "num_input_tokens_seen": 40362400, "step": 40120 }, { "epoch": 18.917963224893917, "grad_norm": 0.1709107756614685, "learning_rate": 4.448718438210819e-07, "loss": 0.0599, "num_input_tokens_seen": 40369760, "step": 40125 }, { "epoch": 18.92032060348892, "grad_norm": 0.6991824507713318, "learning_rate": 4.4294209422001966e-07, "loss": 0.0339, "num_input_tokens_seen": 40374624, "step": 40130 }, { "epoch": 18.922677982083922, "grad_norm": 0.3744472861289978, "learning_rate": 4.4101650171189946e-07, "loss": 0.0931, "num_input_tokens_seen": 40380608, "step": 40135 }, { "epoch": 18.925035360678926, "grad_norm": 0.26036006212234497, "learning_rate": 4.390950666226995e-07, "loss": 0.0337, "num_input_tokens_seen": 40384992, "step": 40140 }, { "epoch": 18.927392739273927, "grad_norm": 0.29922428727149963, "learning_rate": 4.371777892776846e-07, "loss": 0.0712, "num_input_tokens_seen": 40390464, "step": 40145 }, { "epoch": 18.92975011786893, "grad_norm": 0.7883471250534058, "learning_rate": 4.352646700014146e-07, "loss": 0.1516, "num_input_tokens_seen": 40396192, "step": 40150 }, { "epoch": 18.93210749646393, "grad_norm": 0.18270981311798096, "learning_rate": 4.3335570911775257e-07, "loss": 0.039, "num_input_tokens_seen": 40401216, "step": 40155 }, { "epoch": 18.934464875058936, "grad_norm": 0.7390973567962646, "learning_rate": 4.314509069498512e-07, "loss": 0.0622, "num_input_tokens_seen": 40406432, "step": 40160 }, { "epoch": 18.936822253653936, "grad_norm": 2.530463695526123, "learning_rate": 4.2955026382016097e-07, "loss": 0.2014, "num_input_tokens_seen": 40411776, "step": 40165 }, { "epoch": 18.93917963224894, "grad_norm": 0.7256104350090027, "learning_rate": 4.276537800504299e-07, "loss": 0.0628, "num_input_tokens_seen": 40415808, "step": 40170 }, { "epoch": 18.94153701084394, "grad_norm": 1.0537654161453247, "learning_rate": 4.2576145596169845e-07, "loss": 0.1504, "num_input_tokens_seen": 40420864, "step": 40175 }, { "epoch": 18.943894389438945, "grad_norm": 1.1814343929290771, "learning_rate": 4.238732918743049e-07, "loss": 0.0819, "num_input_tokens_seen": 40425152, "step": 40180 }, { "epoch": 18.946251768033946, "grad_norm": 1.4059220552444458, "learning_rate": 4.219892881078824e-07, "loss": 0.1008, "num_input_tokens_seen": 40429600, "step": 40185 }, { "epoch": 18.94860914662895, "grad_norm": 0.2178684026002884, "learning_rate": 4.201094449813647e-07, "loss": 0.0987, "num_input_tokens_seen": 40434272, "step": 40190 }, { "epoch": 18.95096652522395, "grad_norm": 1.309934139251709, "learning_rate": 4.182337628129751e-07, "loss": 0.1234, "num_input_tokens_seen": 40438784, "step": 40195 }, { "epoch": 18.953323903818955, "grad_norm": 0.10367795079946518, "learning_rate": 4.163622419202345e-07, "loss": 0.0894, "num_input_tokens_seen": 40442816, "step": 40200 }, { "epoch": 18.955681282413956, "grad_norm": 0.149761363863945, "learning_rate": 4.144948826199618e-07, "loss": 0.052, "num_input_tokens_seen": 40448384, "step": 40205 }, { "epoch": 18.95803866100896, "grad_norm": 2.1244935989379883, "learning_rate": 4.1263168522826525e-07, "loss": 0.2825, "num_input_tokens_seen": 40453888, "step": 40210 }, { "epoch": 18.96039603960396, "grad_norm": 0.041752465069293976, "learning_rate": 4.1077265006055634e-07, "loss": 0.1606, "num_input_tokens_seen": 40458560, "step": 40215 }, { "epoch": 18.962753418198965, "grad_norm": 0.29931527376174927, "learning_rate": 4.0891777743153334e-07, "loss": 0.1368, "num_input_tokens_seen": 40463520, "step": 40220 }, { "epoch": 18.965110796793965, "grad_norm": 0.21272213757038116, "learning_rate": 4.0706706765520074e-07, "loss": 0.0841, "num_input_tokens_seen": 40469408, "step": 40225 }, { "epoch": 18.967468175388966, "grad_norm": 0.11179907619953156, "learning_rate": 4.0522052104484675e-07, "loss": 0.0218, "num_input_tokens_seen": 40474240, "step": 40230 }, { "epoch": 18.96982555398397, "grad_norm": 0.022395730018615723, "learning_rate": 4.033781379130658e-07, "loss": 0.0552, "num_input_tokens_seen": 40479424, "step": 40235 }, { "epoch": 18.97218293257897, "grad_norm": 0.11395379155874252, "learning_rate": 4.015399185717389e-07, "loss": 0.1793, "num_input_tokens_seen": 40483808, "step": 40240 }, { "epoch": 18.974540311173975, "grad_norm": 0.04421871900558472, "learning_rate": 3.9970586333204496e-07, "loss": 0.1387, "num_input_tokens_seen": 40488352, "step": 40245 }, { "epoch": 18.976897689768975, "grad_norm": 0.15429091453552246, "learning_rate": 3.9787597250445784e-07, "loss": 0.0772, "num_input_tokens_seen": 40493312, "step": 40250 }, { "epoch": 18.97925506836398, "grad_norm": 0.2041464000940323, "learning_rate": 3.960502463987492e-07, "loss": 0.0801, "num_input_tokens_seen": 40498112, "step": 40255 }, { "epoch": 18.98161244695898, "grad_norm": 0.24072493612766266, "learning_rate": 3.942286853239857e-07, "loss": 0.0248, "num_input_tokens_seen": 40502784, "step": 40260 }, { "epoch": 18.983969825553984, "grad_norm": 3.2731032371520996, "learning_rate": 3.924112895885207e-07, "loss": 0.1995, "num_input_tokens_seen": 40507488, "step": 40265 }, { "epoch": 18.986327204148985, "grad_norm": 1.18509840965271, "learning_rate": 3.905980595000108e-07, "loss": 0.1477, "num_input_tokens_seen": 40511744, "step": 40270 }, { "epoch": 18.98868458274399, "grad_norm": 0.47649601101875305, "learning_rate": 3.88788995365405e-07, "loss": 0.0946, "num_input_tokens_seen": 40517600, "step": 40275 }, { "epoch": 18.99104196133899, "grad_norm": 0.02142823301255703, "learning_rate": 3.869840974909472e-07, "loss": 0.0613, "num_input_tokens_seen": 40521856, "step": 40280 }, { "epoch": 18.993399339933994, "grad_norm": 0.030549153685569763, "learning_rate": 3.851833661821791e-07, "loss": 0.076, "num_input_tokens_seen": 40527104, "step": 40285 }, { "epoch": 18.995756718528995, "grad_norm": 0.058710977435112, "learning_rate": 3.8338680174392906e-07, "loss": 0.0429, "num_input_tokens_seen": 40532960, "step": 40290 }, { "epoch": 18.998114097124, "grad_norm": 0.4073282778263092, "learning_rate": 3.815944044803288e-07, "loss": 0.0632, "num_input_tokens_seen": 40538112, "step": 40295 }, { "epoch": 19.0, "eval_loss": 0.15307621657848358, "eval_runtime": 15.1312, "eval_samples_per_second": 62.322, "eval_steps_per_second": 15.597, "num_input_tokens_seen": 40541312, "step": 40299 }, { "epoch": 19.000471475719, "grad_norm": 1.3607136011123657, "learning_rate": 3.7980617469479953e-07, "loss": 0.1769, "num_input_tokens_seen": 40542240, "step": 40300 }, { "epoch": 19.002828854314004, "grad_norm": 0.2798105776309967, "learning_rate": 3.780221126900574e-07, "loss": 0.1543, "num_input_tokens_seen": 40546560, "step": 40305 }, { "epoch": 19.005186232909004, "grad_norm": 0.8466254472732544, "learning_rate": 3.762422187681136e-07, "loss": 0.0701, "num_input_tokens_seen": 40551680, "step": 40310 }, { "epoch": 19.00754361150401, "grad_norm": 1.4841340780258179, "learning_rate": 3.744664932302744e-07, "loss": 0.1493, "num_input_tokens_seen": 40556544, "step": 40315 }, { "epoch": 19.00990099009901, "grad_norm": 2.258136034011841, "learning_rate": 3.7269493637714094e-07, "loss": 0.112, "num_input_tokens_seen": 40561440, "step": 40320 }, { "epoch": 19.012258368694013, "grad_norm": 1.0605649948120117, "learning_rate": 3.709275485086039e-07, "loss": 0.0449, "num_input_tokens_seen": 40566048, "step": 40325 }, { "epoch": 19.014615747289014, "grad_norm": 0.17488902807235718, "learning_rate": 3.691643299238573e-07, "loss": 0.0798, "num_input_tokens_seen": 40570432, "step": 40330 }, { "epoch": 19.016973125884018, "grad_norm": 2.4385251998901367, "learning_rate": 3.674052809213818e-07, "loss": 0.139, "num_input_tokens_seen": 40574432, "step": 40335 }, { "epoch": 19.01933050447902, "grad_norm": 0.0386047437787056, "learning_rate": 3.656504017989587e-07, "loss": 0.0067, "num_input_tokens_seen": 40579040, "step": 40340 }, { "epoch": 19.021687883074023, "grad_norm": 0.06532891094684601, "learning_rate": 3.6389969285365036e-07, "loss": 0.0121, "num_input_tokens_seen": 40584672, "step": 40345 }, { "epoch": 19.024045261669023, "grad_norm": 0.05359989032149315, "learning_rate": 3.6215315438182795e-07, "loss": 0.1155, "num_input_tokens_seen": 40590208, "step": 40350 }, { "epoch": 19.026402640264028, "grad_norm": 0.9439308643341064, "learning_rate": 3.604107866791495e-07, "loss": 0.0943, "num_input_tokens_seen": 40597152, "step": 40355 }, { "epoch": 19.02876001885903, "grad_norm": 1.1996511220932007, "learning_rate": 3.5867259004056795e-07, "loss": 0.0741, "num_input_tokens_seen": 40602656, "step": 40360 }, { "epoch": 19.031117397454032, "grad_norm": 0.12482674419879913, "learning_rate": 3.569385647603313e-07, "loss": 0.0814, "num_input_tokens_seen": 40607008, "step": 40365 }, { "epoch": 19.033474776049033, "grad_norm": 0.09804227203130722, "learning_rate": 3.5520871113197964e-07, "loss": 0.0178, "num_input_tokens_seen": 40611328, "step": 40370 }, { "epoch": 19.035832154644037, "grad_norm": 1.6055636405944824, "learning_rate": 3.5348302944834824e-07, "loss": 0.2212, "num_input_tokens_seen": 40618432, "step": 40375 }, { "epoch": 19.038189533239038, "grad_norm": 0.10050681978464127, "learning_rate": 3.5176152000157006e-07, "loss": 0.0117, "num_input_tokens_seen": 40623296, "step": 40380 }, { "epoch": 19.040546911834042, "grad_norm": 1.498837947845459, "learning_rate": 3.500441830830592e-07, "loss": 0.2193, "num_input_tokens_seen": 40629184, "step": 40385 }, { "epoch": 19.042904290429043, "grad_norm": 0.6538693308830261, "learning_rate": 3.4833101898353594e-07, "loss": 0.0175, "num_input_tokens_seen": 40634112, "step": 40390 }, { "epoch": 19.045261669024047, "grad_norm": 0.06570905447006226, "learning_rate": 3.4662202799300993e-07, "loss": 0.0117, "num_input_tokens_seen": 40640448, "step": 40395 }, { "epoch": 19.047619047619047, "grad_norm": 0.10608214139938354, "learning_rate": 3.4491721040078305e-07, "loss": 0.0238, "num_input_tokens_seen": 40644992, "step": 40400 }, { "epoch": 19.04997642621405, "grad_norm": 0.6353428959846497, "learning_rate": 3.432165664954551e-07, "loss": 0.1355, "num_input_tokens_seen": 40650016, "step": 40405 }, { "epoch": 19.052333804809052, "grad_norm": 0.8875287771224976, "learning_rate": 3.4152009656491235e-07, "loss": 0.1723, "num_input_tokens_seen": 40654976, "step": 40410 }, { "epoch": 19.054691183404056, "grad_norm": 0.30606094002723694, "learning_rate": 3.398278008963418e-07, "loss": 0.0887, "num_input_tokens_seen": 40659296, "step": 40415 }, { "epoch": 19.057048561999057, "grad_norm": 1.0618250370025635, "learning_rate": 3.3813967977621986e-07, "loss": 0.1047, "num_input_tokens_seen": 40665504, "step": 40420 }, { "epoch": 19.059405940594058, "grad_norm": 0.39338526129722595, "learning_rate": 3.3645573349031234e-07, "loss": 0.0933, "num_input_tokens_seen": 40670560, "step": 40425 }, { "epoch": 19.061763319189062, "grad_norm": 0.40655717253685, "learning_rate": 3.3477596232368845e-07, "loss": 0.2101, "num_input_tokens_seen": 40674816, "step": 40430 }, { "epoch": 19.064120697784062, "grad_norm": 0.34208837151527405, "learning_rate": 3.3310036656070134e-07, "loss": 0.0504, "num_input_tokens_seen": 40679520, "step": 40435 }, { "epoch": 19.066478076379067, "grad_norm": 1.049030065536499, "learning_rate": 3.314289464850018e-07, "loss": 0.1672, "num_input_tokens_seen": 40683712, "step": 40440 }, { "epoch": 19.068835454974067, "grad_norm": 0.2039317488670349, "learning_rate": 3.2976170237953295e-07, "loss": 0.1064, "num_input_tokens_seen": 40689344, "step": 40445 }, { "epoch": 19.07119283356907, "grad_norm": 0.037419818341732025, "learning_rate": 3.28098634526533e-07, "loss": 0.1711, "num_input_tokens_seen": 40694240, "step": 40450 }, { "epoch": 19.073550212164072, "grad_norm": 0.7071806788444519, "learning_rate": 3.264397432075267e-07, "loss": 0.0409, "num_input_tokens_seen": 40699616, "step": 40455 }, { "epoch": 19.075907590759076, "grad_norm": 0.5925111770629883, "learning_rate": 3.247850287033422e-07, "loss": 0.0365, "num_input_tokens_seen": 40705344, "step": 40460 }, { "epoch": 19.078264969354077, "grad_norm": 0.16660486161708832, "learning_rate": 3.2313449129408867e-07, "loss": 0.075, "num_input_tokens_seen": 40710112, "step": 40465 }, { "epoch": 19.08062234794908, "grad_norm": 0.7378759980201721, "learning_rate": 3.214881312591761e-07, "loss": 0.0659, "num_input_tokens_seen": 40714944, "step": 40470 }, { "epoch": 19.08297972654408, "grad_norm": 0.21021795272827148, "learning_rate": 3.1984594887730645e-07, "loss": 0.1025, "num_input_tokens_seen": 40720000, "step": 40475 }, { "epoch": 19.085337105139086, "grad_norm": 3.8998379707336426, "learning_rate": 3.182079444264713e-07, "loss": 0.1574, "num_input_tokens_seen": 40727136, "step": 40480 }, { "epoch": 19.087694483734087, "grad_norm": 0.10953086614608765, "learning_rate": 3.165741181839571e-07, "loss": 0.0957, "num_input_tokens_seen": 40731520, "step": 40485 }, { "epoch": 19.09005186232909, "grad_norm": 0.39695245027542114, "learning_rate": 3.1494447042634546e-07, "loss": 0.0815, "num_input_tokens_seen": 40736672, "step": 40490 }, { "epoch": 19.09240924092409, "grad_norm": 0.522955060005188, "learning_rate": 3.133190014295073e-07, "loss": 0.1372, "num_input_tokens_seen": 40741664, "step": 40495 }, { "epoch": 19.094766619519095, "grad_norm": 0.5589240789413452, "learning_rate": 3.116977114686059e-07, "loss": 0.1499, "num_input_tokens_seen": 40746240, "step": 40500 }, { "epoch": 19.097123998114096, "grad_norm": 0.4890235960483551, "learning_rate": 3.100806008180995e-07, "loss": 0.0476, "num_input_tokens_seen": 40751776, "step": 40505 }, { "epoch": 19.0994813767091, "grad_norm": 2.5421786308288574, "learning_rate": 3.084676697517358e-07, "loss": 0.0811, "num_input_tokens_seen": 40757024, "step": 40510 }, { "epoch": 19.1018387553041, "grad_norm": 0.7902030348777771, "learning_rate": 3.0685891854255745e-07, "loss": 0.109, "num_input_tokens_seen": 40762592, "step": 40515 }, { "epoch": 19.104196133899105, "grad_norm": 0.20203666388988495, "learning_rate": 3.0525434746289663e-07, "loss": 0.016, "num_input_tokens_seen": 40767264, "step": 40520 }, { "epoch": 19.106553512494106, "grad_norm": 0.6336272358894348, "learning_rate": 3.0365395678438327e-07, "loss": 0.1039, "num_input_tokens_seen": 40772256, "step": 40525 }, { "epoch": 19.10891089108911, "grad_norm": 0.15909722447395325, "learning_rate": 3.02057746777934e-07, "loss": 0.0175, "num_input_tokens_seen": 40777888, "step": 40530 }, { "epoch": 19.11126826968411, "grad_norm": 0.9089717864990234, "learning_rate": 3.0046571771376043e-07, "loss": 0.0827, "num_input_tokens_seen": 40783200, "step": 40535 }, { "epoch": 19.113625648279115, "grad_norm": 0.8617770075798035, "learning_rate": 2.9887786986136923e-07, "loss": 0.0355, "num_input_tokens_seen": 40788704, "step": 40540 }, { "epoch": 19.115983026874115, "grad_norm": 0.09278248995542526, "learning_rate": 2.9729420348955093e-07, "loss": 0.0449, "num_input_tokens_seen": 40793824, "step": 40545 }, { "epoch": 19.11834040546912, "grad_norm": 1.9265145063400269, "learning_rate": 2.9571471886639946e-07, "loss": 0.0888, "num_input_tokens_seen": 40799168, "step": 40550 }, { "epoch": 19.12069778406412, "grad_norm": 0.05401997268199921, "learning_rate": 2.9413941625928696e-07, "loss": 0.1884, "num_input_tokens_seen": 40804320, "step": 40555 }, { "epoch": 19.123055162659124, "grad_norm": 2.1455187797546387, "learning_rate": 2.925682959348919e-07, "loss": 0.1341, "num_input_tokens_seen": 40808864, "step": 40560 }, { "epoch": 19.125412541254125, "grad_norm": 0.1357075721025467, "learning_rate": 2.910013581591764e-07, "loss": 0.0189, "num_input_tokens_seen": 40813504, "step": 40565 }, { "epoch": 19.12776991984913, "grad_norm": 0.31327593326568604, "learning_rate": 2.8943860319739504e-07, "loss": 0.1071, "num_input_tokens_seen": 40819232, "step": 40570 }, { "epoch": 19.13012729844413, "grad_norm": 0.03715825080871582, "learning_rate": 2.8788003131409444e-07, "loss": 0.0126, "num_input_tokens_seen": 40824192, "step": 40575 }, { "epoch": 19.132484677039134, "grad_norm": 0.11731989681720734, "learning_rate": 2.8632564277311923e-07, "loss": 0.0622, "num_input_tokens_seen": 40828224, "step": 40580 }, { "epoch": 19.134842055634135, "grad_norm": 0.2630273997783661, "learning_rate": 2.847754378376005e-07, "loss": 0.025, "num_input_tokens_seen": 40833152, "step": 40585 }, { "epoch": 19.13719943422914, "grad_norm": 0.0628264769911766, "learning_rate": 2.8322941676995615e-07, "loss": 0.1669, "num_input_tokens_seen": 40838208, "step": 40590 }, { "epoch": 19.13955681282414, "grad_norm": 1.1292489767074585, "learning_rate": 2.816875798319074e-07, "loss": 0.2121, "num_input_tokens_seen": 40845152, "step": 40595 }, { "epoch": 19.141914191419144, "grad_norm": 1.4145270586013794, "learning_rate": 2.8014992728445656e-07, "loss": 0.0517, "num_input_tokens_seen": 40849760, "step": 40600 }, { "epoch": 19.144271570014144, "grad_norm": 0.3013930916786194, "learning_rate": 2.786164593879037e-07, "loss": 0.0234, "num_input_tokens_seen": 40854336, "step": 40605 }, { "epoch": 19.14662894860915, "grad_norm": 0.16689656674861908, "learning_rate": 2.7708717640183844e-07, "loss": 0.0266, "num_input_tokens_seen": 40859264, "step": 40610 }, { "epoch": 19.14898632720415, "grad_norm": 0.7415618896484375, "learning_rate": 2.755620785851426e-07, "loss": 0.0984, "num_input_tokens_seen": 40863168, "step": 40615 }, { "epoch": 19.15134370579915, "grad_norm": 1.8402657508850098, "learning_rate": 2.74041166195993e-07, "loss": 0.1369, "num_input_tokens_seen": 40867360, "step": 40620 }, { "epoch": 19.153701084394154, "grad_norm": 0.03553071618080139, "learning_rate": 2.7252443949185024e-07, "loss": 0.0575, "num_input_tokens_seen": 40871968, "step": 40625 }, { "epoch": 19.156058462989154, "grad_norm": 0.31038200855255127, "learning_rate": 2.7101189872947017e-07, "loss": 0.0675, "num_input_tokens_seen": 40877312, "step": 40630 }, { "epoch": 19.15841584158416, "grad_norm": 1.275126576423645, "learning_rate": 2.6950354416490076e-07, "loss": 0.1266, "num_input_tokens_seen": 40882336, "step": 40635 }, { "epoch": 19.16077322017916, "grad_norm": 0.3675485849380493, "learning_rate": 2.679993760534821e-07, "loss": 0.0732, "num_input_tokens_seen": 40887072, "step": 40640 }, { "epoch": 19.163130598774163, "grad_norm": 1.1981120109558105, "learning_rate": 2.66499394649844e-07, "loss": 0.09, "num_input_tokens_seen": 40891680, "step": 40645 }, { "epoch": 19.165487977369164, "grad_norm": 0.01811908185482025, "learning_rate": 2.650036002079054e-07, "loss": 0.0107, "num_input_tokens_seen": 40895744, "step": 40650 }, { "epoch": 19.167845355964168, "grad_norm": 0.032604239881038666, "learning_rate": 2.6351199298088325e-07, "loss": 0.0977, "num_input_tokens_seen": 40900512, "step": 40655 }, { "epoch": 19.17020273455917, "grad_norm": 0.27121153473854065, "learning_rate": 2.620245732212784e-07, "loss": 0.1371, "num_input_tokens_seen": 40905664, "step": 40660 }, { "epoch": 19.172560113154173, "grad_norm": 1.3726650476455688, "learning_rate": 2.6054134118088656e-07, "loss": 0.3984, "num_input_tokens_seen": 40910176, "step": 40665 }, { "epoch": 19.174917491749174, "grad_norm": 1.2992063760757446, "learning_rate": 2.590622971107931e-07, "loss": 0.2394, "num_input_tokens_seen": 40915232, "step": 40670 }, { "epoch": 19.177274870344178, "grad_norm": 1.222287893295288, "learning_rate": 2.575874412613782e-07, "loss": 0.1941, "num_input_tokens_seen": 40922592, "step": 40675 }, { "epoch": 19.17963224893918, "grad_norm": 2.7608683109283447, "learning_rate": 2.561167738823034e-07, "loss": 0.1501, "num_input_tokens_seen": 40928096, "step": 40680 }, { "epoch": 19.181989627534183, "grad_norm": 0.6314074397087097, "learning_rate": 2.5465029522253327e-07, "loss": 0.0433, "num_input_tokens_seen": 40934912, "step": 40685 }, { "epoch": 19.184347006129183, "grad_norm": 0.8857321739196777, "learning_rate": 2.531880055303165e-07, "loss": 0.0623, "num_input_tokens_seen": 40939680, "step": 40690 }, { "epoch": 19.186704384724187, "grad_norm": 0.03540593385696411, "learning_rate": 2.5172990505319406e-07, "loss": 0.0276, "num_input_tokens_seen": 40943744, "step": 40695 }, { "epoch": 19.189061763319188, "grad_norm": 0.7132664322853088, "learning_rate": 2.502759940379934e-07, "loss": 0.1037, "num_input_tokens_seen": 40949344, "step": 40700 }, { "epoch": 19.191419141914192, "grad_norm": 1.3466968536376953, "learning_rate": 2.488262727308427e-07, "loss": 0.1412, "num_input_tokens_seen": 40954048, "step": 40705 }, { "epoch": 19.193776520509193, "grad_norm": 0.0858595073223114, "learning_rate": 2.4738074137715394e-07, "loss": 0.2777, "num_input_tokens_seen": 40958848, "step": 40710 }, { "epoch": 19.196133899104197, "grad_norm": 0.0425964891910553, "learning_rate": 2.459394002216314e-07, "loss": 0.1787, "num_input_tokens_seen": 40963296, "step": 40715 }, { "epoch": 19.198491277699198, "grad_norm": 0.7181000709533691, "learning_rate": 2.4450224950826604e-07, "loss": 0.0368, "num_input_tokens_seen": 40968128, "step": 40720 }, { "epoch": 19.200848656294202, "grad_norm": 0.3454618752002716, "learning_rate": 2.430692894803438e-07, "loss": 0.1523, "num_input_tokens_seen": 40972832, "step": 40725 }, { "epoch": 19.203206034889202, "grad_norm": 2.032379150390625, "learning_rate": 2.416405203804456e-07, "loss": 0.1449, "num_input_tokens_seen": 40978976, "step": 40730 }, { "epoch": 19.205563413484207, "grad_norm": 0.8575447201728821, "learning_rate": 2.402159424504308e-07, "loss": 0.0396, "num_input_tokens_seen": 40982912, "step": 40735 }, { "epoch": 19.207920792079207, "grad_norm": 1.6506943702697754, "learning_rate": 2.38795555931462e-07, "loss": 0.0957, "num_input_tokens_seen": 40988160, "step": 40740 }, { "epoch": 19.21027817067421, "grad_norm": 0.4735238254070282, "learning_rate": 2.3737936106398295e-07, "loss": 0.081, "num_input_tokens_seen": 40993184, "step": 40745 }, { "epoch": 19.212635549269212, "grad_norm": 0.06417587399482727, "learning_rate": 2.359673580877325e-07, "loss": 0.1504, "num_input_tokens_seen": 40997600, "step": 40750 }, { "epoch": 19.214992927864216, "grad_norm": 2.4487829208374023, "learning_rate": 2.3455954724174167e-07, "loss": 0.2533, "num_input_tokens_seen": 41002528, "step": 40755 }, { "epoch": 19.217350306459217, "grad_norm": 1.4690170288085938, "learning_rate": 2.3315592876432535e-07, "loss": 0.0818, "num_input_tokens_seen": 41007328, "step": 40760 }, { "epoch": 19.21970768505422, "grad_norm": 0.6575022339820862, "learning_rate": 2.3175650289309348e-07, "loss": 0.146, "num_input_tokens_seen": 41011584, "step": 40765 }, { "epoch": 19.22206506364922, "grad_norm": 0.7949140667915344, "learning_rate": 2.3036126986494267e-07, "loss": 0.1495, "num_input_tokens_seen": 41016544, "step": 40770 }, { "epoch": 19.224422442244226, "grad_norm": 0.16444268822669983, "learning_rate": 2.2897022991606455e-07, "loss": 0.0501, "num_input_tokens_seen": 41021056, "step": 40775 }, { "epoch": 19.226779820839226, "grad_norm": 0.5179376602172852, "learning_rate": 2.2758338328194018e-07, "loss": 0.1426, "num_input_tokens_seen": 41025504, "step": 40780 }, { "epoch": 19.22913719943423, "grad_norm": 0.2095092535018921, "learning_rate": 2.2620073019733734e-07, "loss": 0.0509, "num_input_tokens_seen": 41030944, "step": 40785 }, { "epoch": 19.23149457802923, "grad_norm": 0.14529505372047424, "learning_rate": 2.2482227089631324e-07, "loss": 0.0318, "num_input_tokens_seen": 41037408, "step": 40790 }, { "epoch": 19.233851956624235, "grad_norm": 1.6815940141677856, "learning_rate": 2.234480056122229e-07, "loss": 0.1292, "num_input_tokens_seen": 41041824, "step": 40795 }, { "epoch": 19.236209335219236, "grad_norm": 1.6663343906402588, "learning_rate": 2.2207793457770242e-07, "loss": 0.1273, "num_input_tokens_seen": 41046080, "step": 40800 }, { "epoch": 19.23856671381424, "grad_norm": 2.009370803833008, "learning_rate": 2.2071205802468299e-07, "loss": 0.0581, "num_input_tokens_seen": 41051200, "step": 40805 }, { "epoch": 19.24092409240924, "grad_norm": 2.188469409942627, "learning_rate": 2.1935037618438237e-07, "loss": 0.1902, "num_input_tokens_seen": 41057696, "step": 40810 }, { "epoch": 19.243281471004245, "grad_norm": 1.0610524415969849, "learning_rate": 2.1799288928731064e-07, "loss": 0.032, "num_input_tokens_seen": 41062048, "step": 40815 }, { "epoch": 19.245638849599246, "grad_norm": 0.13007251918315887, "learning_rate": 2.1663959756327011e-07, "loss": 0.3194, "num_input_tokens_seen": 41067104, "step": 40820 }, { "epoch": 19.247996228194246, "grad_norm": 0.6127053499221802, "learning_rate": 2.1529050124134699e-07, "loss": 0.0608, "num_input_tokens_seen": 41071392, "step": 40825 }, { "epoch": 19.25035360678925, "grad_norm": 2.4135804176330566, "learning_rate": 2.139456005499224e-07, "loss": 0.2536, "num_input_tokens_seen": 41076288, "step": 40830 }, { "epoch": 19.25271098538425, "grad_norm": 2.383636474609375, "learning_rate": 2.126048957166671e-07, "loss": 0.0556, "num_input_tokens_seen": 41081056, "step": 40835 }, { "epoch": 19.255068363979255, "grad_norm": 0.6965378522872925, "learning_rate": 2.112683869685328e-07, "loss": 0.2117, "num_input_tokens_seen": 41086080, "step": 40840 }, { "epoch": 19.257425742574256, "grad_norm": 0.17873115837574005, "learning_rate": 2.0993607453177465e-07, "loss": 0.1554, "num_input_tokens_seen": 41091616, "step": 40845 }, { "epoch": 19.25978312116926, "grad_norm": 0.10934807360172272, "learning_rate": 2.0860795863192617e-07, "loss": 0.0161, "num_input_tokens_seen": 41096192, "step": 40850 }, { "epoch": 19.26214049976426, "grad_norm": 4.15507173538208, "learning_rate": 2.0728403949381582e-07, "loss": 0.2479, "num_input_tokens_seen": 41100384, "step": 40855 }, { "epoch": 19.264497878359265, "grad_norm": 0.38535767793655396, "learning_rate": 2.0596431734156153e-07, "loss": 0.143, "num_input_tokens_seen": 41105632, "step": 40860 }, { "epoch": 19.266855256954265, "grad_norm": 0.10390966385602951, "learning_rate": 2.046487923985707e-07, "loss": 0.0435, "num_input_tokens_seen": 41111040, "step": 40865 }, { "epoch": 19.26921263554927, "grad_norm": 0.4794252812862396, "learning_rate": 2.0333746488753746e-07, "loss": 0.0722, "num_input_tokens_seen": 41115968, "step": 40870 }, { "epoch": 19.27157001414427, "grad_norm": 1.2881486415863037, "learning_rate": 2.020303350304481e-07, "loss": 0.1785, "num_input_tokens_seen": 41124576, "step": 40875 }, { "epoch": 19.273927392739274, "grad_norm": 0.01143789105117321, "learning_rate": 2.007274030485784e-07, "loss": 0.0423, "num_input_tokens_seen": 41129952, "step": 40880 }, { "epoch": 19.276284771334275, "grad_norm": 0.042589955031871796, "learning_rate": 1.994286691624908e-07, "loss": 0.1036, "num_input_tokens_seen": 41134080, "step": 40885 }, { "epoch": 19.27864214992928, "grad_norm": 0.0601181834936142, "learning_rate": 1.9813413359204003e-07, "loss": 0.0623, "num_input_tokens_seen": 41138496, "step": 40890 }, { "epoch": 19.28099952852428, "grad_norm": 1.3715283870697021, "learning_rate": 1.968437965563674e-07, "loss": 0.0723, "num_input_tokens_seen": 41142944, "step": 40895 }, { "epoch": 19.283356907119284, "grad_norm": 0.12671661376953125, "learning_rate": 1.9555765827390937e-07, "loss": 0.1165, "num_input_tokens_seen": 41147168, "step": 40900 }, { "epoch": 19.285714285714285, "grad_norm": 0.16304025053977966, "learning_rate": 1.9427571896238062e-07, "loss": 0.0436, "num_input_tokens_seen": 41152384, "step": 40905 }, { "epoch": 19.28807166430929, "grad_norm": 0.4689043462276459, "learning_rate": 1.9299797883879922e-07, "loss": 0.0393, "num_input_tokens_seen": 41157504, "step": 40910 }, { "epoch": 19.29042904290429, "grad_norm": 0.013454314321279526, "learning_rate": 1.9172443811945883e-07, "loss": 0.1263, "num_input_tokens_seen": 41161664, "step": 40915 }, { "epoch": 19.292786421499294, "grad_norm": 0.08680495619773865, "learning_rate": 1.904550970199509e-07, "loss": 0.0415, "num_input_tokens_seen": 41166304, "step": 40920 }, { "epoch": 19.295143800094294, "grad_norm": 0.452165812253952, "learning_rate": 1.8918995575515352e-07, "loss": 0.04, "num_input_tokens_seen": 41170816, "step": 40925 }, { "epoch": 19.2975011786893, "grad_norm": 0.48155543208122253, "learning_rate": 1.879290145392343e-07, "loss": 0.0793, "num_input_tokens_seen": 41175904, "step": 40930 }, { "epoch": 19.2998585572843, "grad_norm": 1.4272559881210327, "learning_rate": 1.8667227358564464e-07, "loss": 0.1983, "num_input_tokens_seen": 41182112, "step": 40935 }, { "epoch": 19.302215935879303, "grad_norm": 1.5570316314697266, "learning_rate": 1.854197331071339e-07, "loss": 0.0501, "num_input_tokens_seen": 41187200, "step": 40940 }, { "epoch": 19.304573314474304, "grad_norm": 0.18492676317691803, "learning_rate": 1.841713933157352e-07, "loss": 0.2106, "num_input_tokens_seen": 41192256, "step": 40945 }, { "epoch": 19.306930693069308, "grad_norm": 0.41558876633644104, "learning_rate": 1.8292725442277114e-07, "loss": 0.029, "num_input_tokens_seen": 41197120, "step": 40950 }, { "epoch": 19.30928807166431, "grad_norm": 1.3814154863357544, "learning_rate": 1.8168731663885108e-07, "loss": 0.1374, "num_input_tokens_seen": 41202016, "step": 40955 }, { "epoch": 19.311645450259313, "grad_norm": 0.6997277736663818, "learning_rate": 1.804515801738793e-07, "loss": 0.3278, "num_input_tokens_seen": 41206304, "step": 40960 }, { "epoch": 19.314002828854314, "grad_norm": 0.017263861373066902, "learning_rate": 1.7922004523704407e-07, "loss": 0.2318, "num_input_tokens_seen": 41210304, "step": 40965 }, { "epoch": 19.316360207449318, "grad_norm": 1.2858097553253174, "learning_rate": 1.7799271203682022e-07, "loss": 0.1904, "num_input_tokens_seen": 41215520, "step": 40970 }, { "epoch": 19.31871758604432, "grad_norm": 0.4973658323287964, "learning_rate": 1.7676958078097772e-07, "loss": 0.0779, "num_input_tokens_seen": 41220160, "step": 40975 }, { "epoch": 19.321074964639323, "grad_norm": 0.8476231694221497, "learning_rate": 1.7555065167657036e-07, "loss": 0.1231, "num_input_tokens_seen": 41225504, "step": 40980 }, { "epoch": 19.323432343234323, "grad_norm": 1.6899192333221436, "learning_rate": 1.7433592492994143e-07, "loss": 0.3679, "num_input_tokens_seen": 41229888, "step": 40985 }, { "epoch": 19.325789721829327, "grad_norm": 0.3522512912750244, "learning_rate": 1.7312540074672646e-07, "loss": 0.0813, "num_input_tokens_seen": 41234880, "step": 40990 }, { "epoch": 19.328147100424328, "grad_norm": 0.41917669773101807, "learning_rate": 1.7191907933184203e-07, "loss": 0.0495, "num_input_tokens_seen": 41239392, "step": 40995 }, { "epoch": 19.330504479019332, "grad_norm": 0.9443212151527405, "learning_rate": 1.7071696088950262e-07, "loss": 0.1059, "num_input_tokens_seen": 41244672, "step": 41000 }, { "epoch": 19.332861857614333, "grad_norm": 0.09215311706066132, "learning_rate": 1.6951904562320653e-07, "loss": 0.0744, "num_input_tokens_seen": 41249440, "step": 41005 }, { "epoch": 19.335219236209337, "grad_norm": 0.1459008753299713, "learning_rate": 1.68325333735736e-07, "loss": 0.0585, "num_input_tokens_seen": 41254944, "step": 41010 }, { "epoch": 19.337576614804338, "grad_norm": 0.9545491933822632, "learning_rate": 1.6713582542916827e-07, "loss": 0.1658, "num_input_tokens_seen": 41260416, "step": 41015 }, { "epoch": 19.33993399339934, "grad_norm": 1.434356927871704, "learning_rate": 1.6595052090486728e-07, "loss": 0.0751, "num_input_tokens_seen": 41265216, "step": 41020 }, { "epoch": 19.342291371994342, "grad_norm": 0.9963545203208923, "learning_rate": 1.6476942036348643e-07, "loss": 0.2311, "num_input_tokens_seen": 41270048, "step": 41025 }, { "epoch": 19.344648750589343, "grad_norm": 0.38749048113822937, "learning_rate": 1.6359252400496295e-07, "loss": 0.0591, "num_input_tokens_seen": 41274944, "step": 41030 }, { "epoch": 19.347006129184347, "grad_norm": 0.7729886770248413, "learning_rate": 1.624198320285264e-07, "loss": 0.0276, "num_input_tokens_seen": 41278752, "step": 41035 }, { "epoch": 19.349363507779348, "grad_norm": 1.1745586395263672, "learning_rate": 1.6125134463269576e-07, "loss": 0.0639, "num_input_tokens_seen": 41283776, "step": 41040 }, { "epoch": 19.351720886374352, "grad_norm": 1.766502857208252, "learning_rate": 1.6008706201527667e-07, "loss": 0.2, "num_input_tokens_seen": 41288896, "step": 41045 }, { "epoch": 19.354078264969353, "grad_norm": 1.090839147567749, "learning_rate": 1.5892698437335595e-07, "loss": 0.2704, "num_input_tokens_seen": 41294048, "step": 41050 }, { "epoch": 19.356435643564357, "grad_norm": 0.17040269076824188, "learning_rate": 1.5777111190332096e-07, "loss": 0.181, "num_input_tokens_seen": 41299488, "step": 41055 }, { "epoch": 19.358793022159357, "grad_norm": 0.13640405237674713, "learning_rate": 1.5661944480084011e-07, "loss": 0.083, "num_input_tokens_seen": 41304928, "step": 41060 }, { "epoch": 19.36115040075436, "grad_norm": 0.043928615748882294, "learning_rate": 1.5547198326087143e-07, "loss": 0.0402, "num_input_tokens_seen": 41309600, "step": 41065 }, { "epoch": 19.363507779349362, "grad_norm": 1.3048933744430542, "learning_rate": 1.5432872747765948e-07, "loss": 0.0771, "num_input_tokens_seen": 41314208, "step": 41070 }, { "epoch": 19.365865157944366, "grad_norm": 0.25945907831192017, "learning_rate": 1.5318967764473557e-07, "loss": 0.1108, "num_input_tokens_seen": 41319200, "step": 41075 }, { "epoch": 19.368222536539367, "grad_norm": 1.022882342338562, "learning_rate": 1.5205483395492882e-07, "loss": 0.0314, "num_input_tokens_seen": 41324928, "step": 41080 }, { "epoch": 19.37057991513437, "grad_norm": 0.9211744070053101, "learning_rate": 1.5092419660034384e-07, "loss": 0.0908, "num_input_tokens_seen": 41329024, "step": 41085 }, { "epoch": 19.372937293729372, "grad_norm": 0.29294779896736145, "learning_rate": 1.4979776577237758e-07, "loss": 0.1869, "num_input_tokens_seen": 41334368, "step": 41090 }, { "epoch": 19.375294672324376, "grad_norm": 0.00927865132689476, "learning_rate": 1.4867554166171638e-07, "loss": 0.0737, "num_input_tokens_seen": 41338560, "step": 41095 }, { "epoch": 19.377652050919377, "grad_norm": 4.186618804931641, "learning_rate": 1.4755752445833604e-07, "loss": 0.1549, "num_input_tokens_seen": 41342720, "step": 41100 }, { "epoch": 19.38000942951438, "grad_norm": 0.687281608581543, "learning_rate": 1.4644371435149629e-07, "loss": 0.1517, "num_input_tokens_seen": 41347488, "step": 41105 }, { "epoch": 19.38236680810938, "grad_norm": 2.5551722049713135, "learning_rate": 1.453341115297435e-07, "loss": 0.2377, "num_input_tokens_seen": 41352320, "step": 41110 }, { "epoch": 19.384724186704386, "grad_norm": 1.5370420217514038, "learning_rate": 1.4422871618092194e-07, "loss": 0.0963, "num_input_tokens_seen": 41357792, "step": 41115 }, { "epoch": 19.387081565299386, "grad_norm": 0.09570179134607315, "learning_rate": 1.431275284921485e-07, "loss": 0.2354, "num_input_tokens_seen": 41362240, "step": 41120 }, { "epoch": 19.38943894389439, "grad_norm": 0.5423904657363892, "learning_rate": 1.4203054864984078e-07, "loss": 0.0264, "num_input_tokens_seen": 41367936, "step": 41125 }, { "epoch": 19.39179632248939, "grad_norm": 2.90864634513855, "learning_rate": 1.4093777683969745e-07, "loss": 0.1862, "num_input_tokens_seen": 41373888, "step": 41130 }, { "epoch": 19.394153701084395, "grad_norm": 0.5726380944252014, "learning_rate": 1.3984921324670385e-07, "loss": 0.1647, "num_input_tokens_seen": 41378240, "step": 41135 }, { "epoch": 19.396511079679396, "grad_norm": 0.2465837299823761, "learning_rate": 1.387648580551376e-07, "loss": 0.0186, "num_input_tokens_seen": 41382976, "step": 41140 }, { "epoch": 19.3988684582744, "grad_norm": 0.9025940299034119, "learning_rate": 1.376847114485602e-07, "loss": 0.0914, "num_input_tokens_seen": 41388960, "step": 41145 }, { "epoch": 19.4012258368694, "grad_norm": 0.37101081013679504, "learning_rate": 1.3660877360982537e-07, "loss": 0.0631, "num_input_tokens_seen": 41395040, "step": 41150 }, { "epoch": 19.403583215464405, "grad_norm": 0.036488842219114304, "learning_rate": 1.355370447210652e-07, "loss": 0.0269, "num_input_tokens_seen": 41399488, "step": 41155 }, { "epoch": 19.405940594059405, "grad_norm": 0.3737836182117462, "learning_rate": 1.3446952496371235e-07, "loss": 0.1394, "num_input_tokens_seen": 41404192, "step": 41160 }, { "epoch": 19.40829797265441, "grad_norm": 0.05175010487437248, "learning_rate": 1.3340621451847501e-07, "loss": 0.0736, "num_input_tokens_seen": 41409184, "step": 41165 }, { "epoch": 19.41065535124941, "grad_norm": 1.2567617893218994, "learning_rate": 1.3234711356535368e-07, "loss": 0.2021, "num_input_tokens_seen": 41414848, "step": 41170 }, { "epoch": 19.413012729844414, "grad_norm": 0.02941802889108658, "learning_rate": 1.3129222228363824e-07, "loss": 0.1032, "num_input_tokens_seen": 41420864, "step": 41175 }, { "epoch": 19.415370108439415, "grad_norm": 0.3463672399520874, "learning_rate": 1.3024154085190533e-07, "loss": 0.1029, "num_input_tokens_seen": 41426400, "step": 41180 }, { "epoch": 19.41772748703442, "grad_norm": 1.157568335533142, "learning_rate": 1.2919506944801264e-07, "loss": 0.1041, "num_input_tokens_seen": 41431488, "step": 41185 }, { "epoch": 19.42008486562942, "grad_norm": 2.057882785797119, "learning_rate": 1.2815280824911013e-07, "loss": 0.109, "num_input_tokens_seen": 41436192, "step": 41190 }, { "epoch": 19.422442244224424, "grad_norm": 0.2076331377029419, "learning_rate": 1.2711475743164002e-07, "loss": 0.0936, "num_input_tokens_seen": 41441280, "step": 41195 }, { "epoch": 19.424799622819425, "grad_norm": 1.0919796228408813, "learning_rate": 1.2608091717132564e-07, "loss": 0.0868, "num_input_tokens_seen": 41447328, "step": 41200 }, { "epoch": 19.42715700141443, "grad_norm": 2.327070474624634, "learning_rate": 1.250512876431742e-07, "loss": 0.1796, "num_input_tokens_seen": 41452736, "step": 41205 }, { "epoch": 19.42951438000943, "grad_norm": 0.09029041975736618, "learning_rate": 1.2402586902148793e-07, "loss": 0.1226, "num_input_tokens_seen": 41458432, "step": 41210 }, { "epoch": 19.431871758604434, "grad_norm": 0.1468489021062851, "learning_rate": 1.23004661479853e-07, "loss": 0.1382, "num_input_tokens_seen": 41463328, "step": 41215 }, { "epoch": 19.434229137199434, "grad_norm": 0.12088951468467712, "learning_rate": 1.2198766519113947e-07, "loss": 0.049, "num_input_tokens_seen": 41467424, "step": 41220 }, { "epoch": 19.436586515794435, "grad_norm": 0.06113861873745918, "learning_rate": 1.2097488032750957e-07, "loss": 0.0379, "num_input_tokens_seen": 41474112, "step": 41225 }, { "epoch": 19.43894389438944, "grad_norm": 0.04573715478181839, "learning_rate": 1.1996630706041235e-07, "loss": 0.101, "num_input_tokens_seen": 41480256, "step": 41230 }, { "epoch": 19.44130127298444, "grad_norm": 1.097583532333374, "learning_rate": 1.189619455605806e-07, "loss": 0.1044, "num_input_tokens_seen": 41484544, "step": 41235 }, { "epoch": 19.443658651579444, "grad_norm": 0.0713472068309784, "learning_rate": 1.1796179599803391e-07, "loss": 0.0504, "num_input_tokens_seen": 41489152, "step": 41240 }, { "epoch": 19.446016030174444, "grad_norm": 0.20282526314258575, "learning_rate": 1.169658585420813e-07, "loss": 0.0986, "num_input_tokens_seen": 41495808, "step": 41245 }, { "epoch": 19.44837340876945, "grad_norm": 0.1636495143175125, "learning_rate": 1.1597413336132124e-07, "loss": 0.0535, "num_input_tokens_seen": 41501248, "step": 41250 }, { "epoch": 19.45073078736445, "grad_norm": 0.5864266753196716, "learning_rate": 1.149866206236333e-07, "loss": 0.0614, "num_input_tokens_seen": 41505504, "step": 41255 }, { "epoch": 19.453088165959453, "grad_norm": 0.04581831023097038, "learning_rate": 1.1400332049618933e-07, "loss": 0.1233, "num_input_tokens_seen": 41510400, "step": 41260 }, { "epoch": 19.455445544554454, "grad_norm": 0.03513235226273537, "learning_rate": 1.1302423314543953e-07, "loss": 0.1813, "num_input_tokens_seen": 41515584, "step": 41265 }, { "epoch": 19.45780292314946, "grad_norm": 1.2392017841339111, "learning_rate": 1.1204935873713185e-07, "loss": 0.1143, "num_input_tokens_seen": 41521632, "step": 41270 }, { "epoch": 19.46016030174446, "grad_norm": 0.20333831012248993, "learning_rate": 1.1107869743629263e-07, "loss": 0.0776, "num_input_tokens_seen": 41525824, "step": 41275 }, { "epoch": 19.462517680339463, "grad_norm": 2.045301914215088, "learning_rate": 1.1011224940724041e-07, "loss": 0.1815, "num_input_tokens_seen": 41530432, "step": 41280 }, { "epoch": 19.464875058934464, "grad_norm": 0.39067015051841736, "learning_rate": 1.0915001481358045e-07, "loss": 0.0301, "num_input_tokens_seen": 41534944, "step": 41285 }, { "epoch": 19.467232437529468, "grad_norm": 0.2685057818889618, "learning_rate": 1.0819199381819634e-07, "loss": 0.0927, "num_input_tokens_seen": 41540224, "step": 41290 }, { "epoch": 19.46958981612447, "grad_norm": 0.6984885334968567, "learning_rate": 1.07238186583275e-07, "loss": 0.0857, "num_input_tokens_seen": 41545184, "step": 41295 }, { "epoch": 19.471947194719473, "grad_norm": 0.224220409989357, "learning_rate": 1.0628859327026786e-07, "loss": 0.0767, "num_input_tokens_seen": 41549888, "step": 41300 }, { "epoch": 19.474304573314473, "grad_norm": 1.6478137969970703, "learning_rate": 1.0534321403993242e-07, "loss": 0.0711, "num_input_tokens_seen": 41555232, "step": 41305 }, { "epoch": 19.476661951909477, "grad_norm": 2.297565460205078, "learning_rate": 1.0440204905230455e-07, "loss": 0.078, "num_input_tokens_seen": 41560832, "step": 41310 }, { "epoch": 19.479019330504478, "grad_norm": 0.8831261992454529, "learning_rate": 1.0346509846670682e-07, "loss": 0.0761, "num_input_tokens_seen": 41565664, "step": 41315 }, { "epoch": 19.481376709099482, "grad_norm": 0.2150881141424179, "learning_rate": 1.0253236244174846e-07, "loss": 0.0678, "num_input_tokens_seen": 41570752, "step": 41320 }, { "epoch": 19.483734087694483, "grad_norm": 0.13457030057907104, "learning_rate": 1.0160384113532539e-07, "loss": 0.015, "num_input_tokens_seen": 41575200, "step": 41325 }, { "epoch": 19.486091466289487, "grad_norm": 0.380985826253891, "learning_rate": 1.0067953470462022e-07, "loss": 0.0761, "num_input_tokens_seen": 41579328, "step": 41330 }, { "epoch": 19.488448844884488, "grad_norm": 0.0991220772266388, "learning_rate": 9.975944330610776e-08, "loss": 0.0672, "num_input_tokens_seen": 41583840, "step": 41335 }, { "epoch": 19.490806223479492, "grad_norm": 0.1540273278951645, "learning_rate": 9.884356709553565e-08, "loss": 0.0516, "num_input_tokens_seen": 41588640, "step": 41340 }, { "epoch": 19.493163602074493, "grad_norm": 0.6118853092193604, "learning_rate": 9.793190622795211e-08, "loss": 0.1054, "num_input_tokens_seen": 41594080, "step": 41345 }, { "epoch": 19.495520980669497, "grad_norm": 0.35856908559799194, "learning_rate": 9.702446085768092e-08, "loss": 0.0476, "num_input_tokens_seen": 41598976, "step": 41350 }, { "epoch": 19.497878359264497, "grad_norm": 0.79696124792099, "learning_rate": 9.61212311383436e-08, "loss": 0.1374, "num_input_tokens_seen": 41603776, "step": 41355 }, { "epoch": 19.5002357378595, "grad_norm": 1.250878930091858, "learning_rate": 9.522221722283453e-08, "loss": 0.1554, "num_input_tokens_seen": 41608672, "step": 41360 }, { "epoch": 19.502593116454502, "grad_norm": 1.8436787128448486, "learning_rate": 9.432741926334865e-08, "loss": 0.0622, "num_input_tokens_seen": 41613920, "step": 41365 }, { "epoch": 19.504950495049506, "grad_norm": 1.3969651460647583, "learning_rate": 9.343683741135645e-08, "loss": 0.1713, "num_input_tokens_seen": 41618688, "step": 41370 }, { "epoch": 19.507307873644507, "grad_norm": 0.10487852990627289, "learning_rate": 9.255047181761512e-08, "loss": 0.0739, "num_input_tokens_seen": 41623200, "step": 41375 }, { "epoch": 19.50966525223951, "grad_norm": 0.4941263496875763, "learning_rate": 9.166832263217683e-08, "loss": 0.1483, "num_input_tokens_seen": 41628352, "step": 41380 }, { "epoch": 19.51202263083451, "grad_norm": 0.13296747207641602, "learning_rate": 9.079039000437218e-08, "loss": 0.4373, "num_input_tokens_seen": 41632672, "step": 41385 }, { "epoch": 19.514380009429516, "grad_norm": 0.9030901789665222, "learning_rate": 8.991667408281835e-08, "loss": 0.047, "num_input_tokens_seen": 41637056, "step": 41390 }, { "epoch": 19.516737388024517, "grad_norm": 0.0712956115603447, "learning_rate": 8.904717501542204e-08, "loss": 0.0142, "num_input_tokens_seen": 41643520, "step": 41395 }, { "epoch": 19.51909476661952, "grad_norm": 0.6579023003578186, "learning_rate": 8.818189294937662e-08, "loss": 0.1343, "num_input_tokens_seen": 41649056, "step": 41400 }, { "epoch": 19.52145214521452, "grad_norm": 0.4885631501674652, "learning_rate": 8.732082803115938e-08, "loss": 0.15, "num_input_tokens_seen": 41653312, "step": 41405 }, { "epoch": 19.523809523809526, "grad_norm": 0.16689451038837433, "learning_rate": 8.646398040653147e-08, "loss": 0.0161, "num_input_tokens_seen": 41658304, "step": 41410 }, { "epoch": 19.526166902404526, "grad_norm": 1.4209219217300415, "learning_rate": 8.561135022054357e-08, "loss": 0.2903, "num_input_tokens_seen": 41663616, "step": 41415 }, { "epoch": 19.528524280999527, "grad_norm": 1.8041499853134155, "learning_rate": 8.476293761753296e-08, "loss": 0.1742, "num_input_tokens_seen": 41668480, "step": 41420 }, { "epoch": 19.53088165959453, "grad_norm": 0.8110234141349792, "learning_rate": 8.391874274111811e-08, "loss": 0.0805, "num_input_tokens_seen": 41672480, "step": 41425 }, { "epoch": 19.53323903818953, "grad_norm": 0.2615642845630646, "learning_rate": 8.30787657342097e-08, "loss": 0.073, "num_input_tokens_seen": 41676640, "step": 41430 }, { "epoch": 19.535596416784536, "grad_norm": 1.2614277601242065, "learning_rate": 8.224300673900231e-08, "loss": 0.1992, "num_input_tokens_seen": 41681952, "step": 41435 }, { "epoch": 19.537953795379536, "grad_norm": 0.15909649431705475, "learning_rate": 8.141146589697445e-08, "loss": 0.0529, "num_input_tokens_seen": 41687520, "step": 41440 }, { "epoch": 19.54031117397454, "grad_norm": 0.07882297784090042, "learning_rate": 8.05841433488913e-08, "loss": 0.0966, "num_input_tokens_seen": 41692352, "step": 41445 }, { "epoch": 19.54266855256954, "grad_norm": 0.9585307240486145, "learning_rate": 7.976103923480471e-08, "loss": 0.1139, "num_input_tokens_seen": 41697728, "step": 41450 }, { "epoch": 19.545025931164545, "grad_norm": 0.09493880718946457, "learning_rate": 7.894215369405322e-08, "loss": 0.0961, "num_input_tokens_seen": 41702272, "step": 41455 }, { "epoch": 19.547383309759546, "grad_norm": 1.7592235803604126, "learning_rate": 7.812748686525928e-08, "loss": 0.0965, "num_input_tokens_seen": 41707424, "step": 41460 }, { "epoch": 19.54974068835455, "grad_norm": 0.1970023363828659, "learning_rate": 7.731703888633202e-08, "loss": 0.0929, "num_input_tokens_seen": 41712224, "step": 41465 }, { "epoch": 19.55209806694955, "grad_norm": 0.8948818445205688, "learning_rate": 7.651080989447002e-08, "loss": 0.1109, "num_input_tokens_seen": 41717536, "step": 41470 }, { "epoch": 19.554455445544555, "grad_norm": 0.9011455774307251, "learning_rate": 7.570880002614744e-08, "loss": 0.0829, "num_input_tokens_seen": 41722336, "step": 41475 }, { "epoch": 19.556812824139556, "grad_norm": 1.4748797416687012, "learning_rate": 7.491100941713625e-08, "loss": 0.2108, "num_input_tokens_seen": 41727328, "step": 41480 }, { "epoch": 19.55917020273456, "grad_norm": 0.07232166826725006, "learning_rate": 7.411743820248674e-08, "loss": 0.0448, "num_input_tokens_seen": 41733664, "step": 41485 }, { "epoch": 19.56152758132956, "grad_norm": 0.11084748059511185, "learning_rate": 7.332808651653867e-08, "loss": 0.1179, "num_input_tokens_seen": 41737888, "step": 41490 }, { "epoch": 19.563884959924565, "grad_norm": 1.1555622816085815, "learning_rate": 7.254295449291293e-08, "loss": 0.2842, "num_input_tokens_seen": 41742240, "step": 41495 }, { "epoch": 19.566242338519565, "grad_norm": 0.1338133066892624, "learning_rate": 7.176204226452266e-08, "loss": 0.1313, "num_input_tokens_seen": 41747872, "step": 41500 }, { "epoch": 19.56859971711457, "grad_norm": 0.2762347161769867, "learning_rate": 7.098534996355933e-08, "loss": 0.0909, "num_input_tokens_seen": 41754144, "step": 41505 }, { "epoch": 19.57095709570957, "grad_norm": 1.7020293474197388, "learning_rate": 7.021287772150665e-08, "loss": 0.1994, "num_input_tokens_seen": 41758528, "step": 41510 }, { "epoch": 19.573314474304574, "grad_norm": 0.631481945514679, "learning_rate": 6.944462566912945e-08, "loss": 0.1044, "num_input_tokens_seen": 41762880, "step": 41515 }, { "epoch": 19.575671852899575, "grad_norm": 2.774195909500122, "learning_rate": 6.868059393648207e-08, "loss": 0.1143, "num_input_tokens_seen": 41766464, "step": 41520 }, { "epoch": 19.57802923149458, "grad_norm": 0.1759503334760666, "learning_rate": 6.792078265289991e-08, "loss": 0.0948, "num_input_tokens_seen": 41770784, "step": 41525 }, { "epoch": 19.58038661008958, "grad_norm": 0.1640367954969406, "learning_rate": 6.716519194700511e-08, "loss": 0.065, "num_input_tokens_seen": 41776000, "step": 41530 }, { "epoch": 19.582743988684584, "grad_norm": 1.1351395845413208, "learning_rate": 6.64138219467092e-08, "loss": 0.076, "num_input_tokens_seen": 41780672, "step": 41535 }, { "epoch": 19.585101367279584, "grad_norm": 1.892451286315918, "learning_rate": 6.566667277920491e-08, "loss": 0.1318, "num_input_tokens_seen": 41785792, "step": 41540 }, { "epoch": 19.58745874587459, "grad_norm": 0.3291120231151581, "learning_rate": 6.492374457097161e-08, "loss": 0.0611, "num_input_tokens_seen": 41789504, "step": 41545 }, { "epoch": 19.58981612446959, "grad_norm": 1.5584286451339722, "learning_rate": 6.418503744777538e-08, "loss": 0.3012, "num_input_tokens_seen": 41794048, "step": 41550 }, { "epoch": 19.592173503064593, "grad_norm": 0.3686571419239044, "learning_rate": 6.345055153466616e-08, "loss": 0.106, "num_input_tokens_seen": 41799136, "step": 41555 }, { "epoch": 19.594530881659594, "grad_norm": 1.46260404586792, "learning_rate": 6.272028695598064e-08, "loss": 0.1107, "num_input_tokens_seen": 41803680, "step": 41560 }, { "epoch": 19.596888260254598, "grad_norm": 1.4321718215942383, "learning_rate": 6.199424383533937e-08, "loss": 0.0975, "num_input_tokens_seen": 41808512, "step": 41565 }, { "epoch": 19.5992456388496, "grad_norm": 0.10017196089029312, "learning_rate": 6.127242229564956e-08, "loss": 0.0805, "num_input_tokens_seen": 41813920, "step": 41570 }, { "epoch": 19.601603017444603, "grad_norm": 0.3842889964580536, "learning_rate": 6.055482245910238e-08, "loss": 0.0343, "num_input_tokens_seen": 41819616, "step": 41575 }, { "epoch": 19.603960396039604, "grad_norm": 0.13355475664138794, "learning_rate": 5.984144444717565e-08, "loss": 0.0543, "num_input_tokens_seen": 41824928, "step": 41580 }, { "epoch": 19.606317774634608, "grad_norm": 1.0290281772613525, "learning_rate": 5.913228838063389e-08, "loss": 0.2093, "num_input_tokens_seen": 41829184, "step": 41585 }, { "epoch": 19.60867515322961, "grad_norm": 0.6240811944007874, "learning_rate": 5.842735437952274e-08, "loss": 0.1008, "num_input_tokens_seen": 41834976, "step": 41590 }, { "epoch": 19.611032531824613, "grad_norm": 0.3970661461353302, "learning_rate": 5.772664256317728e-08, "loss": 0.0228, "num_input_tokens_seen": 41839936, "step": 41595 }, { "epoch": 19.613389910419613, "grad_norm": 0.02077479287981987, "learning_rate": 5.703015305021653e-08, "loss": 0.0738, "num_input_tokens_seen": 41845408, "step": 41600 }, { "epoch": 19.615747289014617, "grad_norm": 1.0585613250732422, "learning_rate": 5.633788595854062e-08, "loss": 0.0713, "num_input_tokens_seen": 41851200, "step": 41605 }, { "epoch": 19.618104667609618, "grad_norm": 2.0869927406311035, "learning_rate": 5.564984140534468e-08, "loss": 0.2838, "num_input_tokens_seen": 41856256, "step": 41610 }, { "epoch": 19.620462046204622, "grad_norm": 2.6371655464172363, "learning_rate": 5.4966019507096676e-08, "loss": 0.158, "num_input_tokens_seen": 41861408, "step": 41615 }, { "epoch": 19.622819424799623, "grad_norm": 0.45149242877960205, "learning_rate": 5.4286420379559555e-08, "loss": 0.0532, "num_input_tokens_seen": 41865824, "step": 41620 }, { "epoch": 19.625176803394623, "grad_norm": 1.635184407234192, "learning_rate": 5.36110441377774e-08, "loss": 0.1778, "num_input_tokens_seen": 41870720, "step": 41625 }, { "epoch": 19.627534181989628, "grad_norm": 0.07789687812328339, "learning_rate": 5.293989089608098e-08, "loss": 0.0789, "num_input_tokens_seen": 41876192, "step": 41630 }, { "epoch": 19.62989156058463, "grad_norm": 0.0683809146285057, "learning_rate": 5.2272960768084965e-08, "loss": 0.1392, "num_input_tokens_seen": 41880672, "step": 41635 }, { "epoch": 19.632248939179632, "grad_norm": 1.4651223421096802, "learning_rate": 5.161025386668794e-08, "loss": 0.2382, "num_input_tokens_seen": 41885472, "step": 41640 }, { "epoch": 19.634606317774633, "grad_norm": 0.11508293449878693, "learning_rate": 5.0951770304072385e-08, "loss": 0.0369, "num_input_tokens_seen": 41891712, "step": 41645 }, { "epoch": 19.636963696369637, "grad_norm": 0.7186707854270935, "learning_rate": 5.0297510191713024e-08, "loss": 0.0715, "num_input_tokens_seen": 41897184, "step": 41650 }, { "epoch": 19.639321074964638, "grad_norm": 0.019931090995669365, "learning_rate": 4.9647473640365706e-08, "loss": 0.1345, "num_input_tokens_seen": 41902016, "step": 41655 }, { "epoch": 19.641678453559642, "grad_norm": 0.08650951832532883, "learning_rate": 4.900166076006463e-08, "loss": 0.0215, "num_input_tokens_seen": 41907104, "step": 41660 }, { "epoch": 19.644035832154643, "grad_norm": 0.3594602346420288, "learning_rate": 4.836007166014178e-08, "loss": 0.0591, "num_input_tokens_seen": 41911904, "step": 41665 }, { "epoch": 19.646393210749647, "grad_norm": 1.2299411296844482, "learning_rate": 4.772270644920196e-08, "loss": 0.1307, "num_input_tokens_seen": 41916352, "step": 41670 }, { "epoch": 19.648750589344647, "grad_norm": 0.4652194678783417, "learning_rate": 4.708956523514496e-08, "loss": 0.0295, "num_input_tokens_seen": 41921632, "step": 41675 }, { "epoch": 19.65110796793965, "grad_norm": 0.025828810408711433, "learning_rate": 4.646064812514617e-08, "loss": 0.147, "num_input_tokens_seen": 41927520, "step": 41680 }, { "epoch": 19.653465346534652, "grad_norm": 0.04883844777941704, "learning_rate": 4.583595522567041e-08, "loss": 0.0229, "num_input_tokens_seen": 41932416, "step": 41685 }, { "epoch": 19.655822725129656, "grad_norm": 0.31564077734947205, "learning_rate": 4.5215486642471997e-08, "loss": 0.1017, "num_input_tokens_seen": 41937152, "step": 41690 }, { "epoch": 19.658180103724657, "grad_norm": 0.09508731961250305, "learning_rate": 4.4599242480586336e-08, "loss": 0.0307, "num_input_tokens_seen": 41942016, "step": 41695 }, { "epoch": 19.66053748231966, "grad_norm": 0.20070822536945343, "learning_rate": 4.398722284432721e-08, "loss": 0.2828, "num_input_tokens_seen": 41946464, "step": 41700 }, { "epoch": 19.662894860914662, "grad_norm": 0.03644350543618202, "learning_rate": 4.3379427837300646e-08, "loss": 0.0391, "num_input_tokens_seen": 41951904, "step": 41705 }, { "epoch": 19.665252239509666, "grad_norm": 0.11098624020814896, "learning_rate": 4.27758575624021e-08, "loss": 0.0357, "num_input_tokens_seen": 41956608, "step": 41710 }, { "epoch": 19.667609618104667, "grad_norm": 0.06569322943687439, "learning_rate": 4.217651212179985e-08, "loss": 0.0422, "num_input_tokens_seen": 41961728, "step": 41715 }, { "epoch": 19.66996699669967, "grad_norm": 0.6161563992500305, "learning_rate": 4.1581391616951623e-08, "loss": 0.1041, "num_input_tokens_seen": 41966528, "step": 41720 }, { "epoch": 19.67232437529467, "grad_norm": 2.492340564727783, "learning_rate": 4.0990496148607395e-08, "loss": 0.2226, "num_input_tokens_seen": 41971616, "step": 41725 }, { "epoch": 19.674681753889676, "grad_norm": 0.20408974587917328, "learning_rate": 4.0403825816789917e-08, "loss": 0.1611, "num_input_tokens_seen": 41976032, "step": 41730 }, { "epoch": 19.677039132484676, "grad_norm": 1.398695945739746, "learning_rate": 3.982138072081698e-08, "loss": 0.1717, "num_input_tokens_seen": 41981216, "step": 41735 }, { "epoch": 19.67939651107968, "grad_norm": 0.1958630084991455, "learning_rate": 3.924316095928193e-08, "loss": 0.0279, "num_input_tokens_seen": 41986368, "step": 41740 }, { "epoch": 19.68175388967468, "grad_norm": 2.659656524658203, "learning_rate": 3.866916663007314e-08, "loss": 0.1581, "num_input_tokens_seen": 41991040, "step": 41745 }, { "epoch": 19.684111268269685, "grad_norm": 1.4458599090576172, "learning_rate": 3.8099397830354544e-08, "loss": 0.1176, "num_input_tokens_seen": 41996864, "step": 41750 }, { "epoch": 19.686468646864686, "grad_norm": 0.7388014793395996, "learning_rate": 3.753385465657677e-08, "loss": 0.1415, "num_input_tokens_seen": 42001440, "step": 41755 }, { "epoch": 19.68882602545969, "grad_norm": 0.03778531029820442, "learning_rate": 3.6972537204479905e-08, "loss": 0.1977, "num_input_tokens_seen": 42005696, "step": 41760 }, { "epoch": 19.69118340405469, "grad_norm": 0.33563488721847534, "learning_rate": 3.641544556908516e-08, "loss": 0.0968, "num_input_tokens_seen": 42011680, "step": 41765 }, { "epoch": 19.693540782649695, "grad_norm": 0.34834882616996765, "learning_rate": 3.5862579844697654e-08, "loss": 0.2247, "num_input_tokens_seen": 42016608, "step": 41770 }, { "epoch": 19.695898161244696, "grad_norm": 0.36952003836631775, "learning_rate": 3.5313940124909184e-08, "loss": 0.0201, "num_input_tokens_seen": 42021312, "step": 41775 }, { "epoch": 19.6982555398397, "grad_norm": 2.5348925590515137, "learning_rate": 3.4769526502592684e-08, "loss": 0.4068, "num_input_tokens_seen": 42026016, "step": 41780 }, { "epoch": 19.7006129184347, "grad_norm": 0.06316391378641129, "learning_rate": 3.4229339069910546e-08, "loss": 0.0376, "num_input_tokens_seen": 42030976, "step": 41785 }, { "epoch": 19.702970297029704, "grad_norm": 1.0576450824737549, "learning_rate": 3.369337791830629e-08, "loss": 0.166, "num_input_tokens_seen": 42035328, "step": 41790 }, { "epoch": 19.705327675624705, "grad_norm": 0.6156944036483765, "learning_rate": 3.3161643138510115e-08, "loss": 0.0386, "num_input_tokens_seen": 42040640, "step": 41795 }, { "epoch": 19.70768505421971, "grad_norm": 0.9437817335128784, "learning_rate": 3.2634134820536125e-08, "loss": 0.0464, "num_input_tokens_seen": 42045120, "step": 41800 }, { "epoch": 19.71004243281471, "grad_norm": 1.283484935760498, "learning_rate": 3.211085305367956e-08, "loss": 0.0933, "num_input_tokens_seen": 42050624, "step": 41805 }, { "epoch": 19.712399811409714, "grad_norm": 0.2766997814178467, "learning_rate": 3.159179792652511e-08, "loss": 0.1601, "num_input_tokens_seen": 42056000, "step": 41810 }, { "epoch": 19.714757190004715, "grad_norm": 3.8599417209625244, "learning_rate": 3.107696952694139e-08, "loss": 0.1556, "num_input_tokens_seen": 42060832, "step": 41815 }, { "epoch": 19.71711456859972, "grad_norm": 2.574639081954956, "learning_rate": 3.056636794207812e-08, "loss": 0.0524, "num_input_tokens_seen": 42065600, "step": 41820 }, { "epoch": 19.71947194719472, "grad_norm": 0.04886529594659805, "learning_rate": 3.0059993258368945e-08, "loss": 0.0971, "num_input_tokens_seen": 42070656, "step": 41825 }, { "epoch": 19.72182932578972, "grad_norm": 0.04083738103508949, "learning_rate": 2.955784556153973e-08, "loss": 0.0987, "num_input_tokens_seen": 42076512, "step": 41830 }, { "epoch": 19.724186704384724, "grad_norm": 0.03590351715683937, "learning_rate": 2.9059924936594706e-08, "loss": 0.0832, "num_input_tokens_seen": 42082368, "step": 41835 }, { "epoch": 19.726544082979725, "grad_norm": 0.38700172305107117, "learning_rate": 2.8566231467819228e-08, "loss": 0.0707, "num_input_tokens_seen": 42087136, "step": 41840 }, { "epoch": 19.72890146157473, "grad_norm": 0.12373460829257965, "learning_rate": 2.807676523879088e-08, "loss": 0.0152, "num_input_tokens_seen": 42092192, "step": 41845 }, { "epoch": 19.73125884016973, "grad_norm": 0.040719613432884216, "learning_rate": 2.759152633236839e-08, "loss": 0.0563, "num_input_tokens_seen": 42096096, "step": 41850 }, { "epoch": 19.733616218764734, "grad_norm": 0.26704496145248413, "learning_rate": 2.71105148306916e-08, "loss": 0.1297, "num_input_tokens_seen": 42100800, "step": 41855 }, { "epoch": 19.735973597359735, "grad_norm": 1.662579894065857, "learning_rate": 2.6633730815189828e-08, "loss": 0.1482, "num_input_tokens_seen": 42106720, "step": 41860 }, { "epoch": 19.73833097595474, "grad_norm": 0.1821572482585907, "learning_rate": 2.6161174366573504e-08, "loss": 0.0151, "num_input_tokens_seen": 42111520, "step": 41865 }, { "epoch": 19.74068835454974, "grad_norm": 0.11342839151620865, "learning_rate": 2.5692845564839753e-08, "loss": 0.1066, "num_input_tokens_seen": 42116064, "step": 41870 }, { "epoch": 19.743045733144744, "grad_norm": 0.19821354746818542, "learning_rate": 2.522874448926682e-08, "loss": 0.0447, "num_input_tokens_seen": 42121632, "step": 41875 }, { "epoch": 19.745403111739744, "grad_norm": 1.1127583980560303, "learning_rate": 2.476887121841964e-08, "loss": 0.1091, "num_input_tokens_seen": 42127360, "step": 41880 }, { "epoch": 19.74776049033475, "grad_norm": 1.2695677280426025, "learning_rate": 2.4313225830147057e-08, "loss": 0.0841, "num_input_tokens_seen": 42131616, "step": 41885 }, { "epoch": 19.75011786892975, "grad_norm": 1.3740992546081543, "learning_rate": 2.3861808401581808e-08, "loss": 0.1859, "num_input_tokens_seen": 42136096, "step": 41890 }, { "epoch": 19.752475247524753, "grad_norm": 1.5721949338912964, "learning_rate": 2.3414619009143323e-08, "loss": 0.3167, "num_input_tokens_seen": 42141120, "step": 41895 }, { "epoch": 19.754832626119754, "grad_norm": 1.6186658143997192, "learning_rate": 2.297165772853216e-08, "loss": 0.2085, "num_input_tokens_seen": 42145856, "step": 41900 }, { "epoch": 19.757190004714758, "grad_norm": 1.887402057647705, "learning_rate": 2.2532924634732778e-08, "loss": 0.163, "num_input_tokens_seen": 42150848, "step": 41905 }, { "epoch": 19.75954738330976, "grad_norm": 0.9427017569541931, "learning_rate": 2.2098419802013548e-08, "loss": 0.0939, "num_input_tokens_seen": 42155744, "step": 41910 }, { "epoch": 19.761904761904763, "grad_norm": 0.03637237101793289, "learning_rate": 2.1668143303935074e-08, "loss": 0.0163, "num_input_tokens_seen": 42161920, "step": 41915 }, { "epoch": 19.764262140499763, "grad_norm": 0.3481992185115814, "learning_rate": 2.1242095213327984e-08, "loss": 0.1788, "num_input_tokens_seen": 42166688, "step": 41920 }, { "epoch": 19.766619519094768, "grad_norm": 0.08125690370798111, "learning_rate": 2.082027560232347e-08, "loss": 0.0328, "num_input_tokens_seen": 42172000, "step": 41925 }, { "epoch": 19.768976897689768, "grad_norm": 0.09682641923427582, "learning_rate": 2.040268454231997e-08, "loss": 0.013, "num_input_tokens_seen": 42176800, "step": 41930 }, { "epoch": 19.771334276284772, "grad_norm": 0.1025732234120369, "learning_rate": 1.9989322104013718e-08, "loss": 0.218, "num_input_tokens_seen": 42181920, "step": 41935 }, { "epoch": 19.773691654879773, "grad_norm": 0.6634439826011658, "learning_rate": 1.9580188357379292e-08, "loss": 0.2405, "num_input_tokens_seen": 42189568, "step": 41940 }, { "epoch": 19.776049033474777, "grad_norm": 0.12506012618541718, "learning_rate": 1.9175283371675178e-08, "loss": 0.0318, "num_input_tokens_seen": 42195712, "step": 41945 }, { "epoch": 19.778406412069778, "grad_norm": 0.03209739550948143, "learning_rate": 1.877460721544655e-08, "loss": 0.0231, "num_input_tokens_seen": 42201280, "step": 41950 }, { "epoch": 19.780763790664782, "grad_norm": 1.66559636592865, "learning_rate": 1.8378159956519703e-08, "loss": 0.1151, "num_input_tokens_seen": 42206144, "step": 41955 }, { "epoch": 19.783121169259783, "grad_norm": 0.6525774002075195, "learning_rate": 1.7985941662004848e-08, "loss": 0.0308, "num_input_tokens_seen": 42210624, "step": 41960 }, { "epoch": 19.785478547854787, "grad_norm": 1.5254530906677246, "learning_rate": 1.7597952398301644e-08, "loss": 0.1858, "num_input_tokens_seen": 42215488, "step": 41965 }, { "epoch": 19.787835926449787, "grad_norm": 1.5445992946624756, "learning_rate": 1.721419223108811e-08, "loss": 0.2158, "num_input_tokens_seen": 42219488, "step": 41970 }, { "epoch": 19.79019330504479, "grad_norm": 1.475773811340332, "learning_rate": 1.6834661225326175e-08, "loss": 0.069, "num_input_tokens_seen": 42224064, "step": 41975 }, { "epoch": 19.792550683639792, "grad_norm": 0.19986802339553833, "learning_rate": 1.6459359445267218e-08, "loss": 0.2329, "num_input_tokens_seen": 42229920, "step": 41980 }, { "epoch": 19.794908062234796, "grad_norm": 1.0780091285705566, "learning_rate": 1.6088286954443753e-08, "loss": 0.0885, "num_input_tokens_seen": 42235136, "step": 41985 }, { "epoch": 19.797265440829797, "grad_norm": 0.2878083288669586, "learning_rate": 1.5721443815666647e-08, "loss": 0.0247, "num_input_tokens_seen": 42240480, "step": 41990 }, { "epoch": 19.7996228194248, "grad_norm": 2.097078323364258, "learning_rate": 1.535883009104455e-08, "loss": 0.1072, "num_input_tokens_seen": 42245920, "step": 41995 }, { "epoch": 19.801980198019802, "grad_norm": 3.5510573387145996, "learning_rate": 1.5000445841956144e-08, "loss": 0.0666, "num_input_tokens_seen": 42250656, "step": 42000 }, { "epoch": 19.804337576614806, "grad_norm": 0.08845101296901703, "learning_rate": 1.4646291129069566e-08, "loss": 0.03, "num_input_tokens_seen": 42255776, "step": 42005 }, { "epoch": 19.806694955209807, "grad_norm": 1.1794981956481934, "learning_rate": 1.4296366012339634e-08, "loss": 0.0775, "num_input_tokens_seen": 42260288, "step": 42010 }, { "epoch": 19.809052333804807, "grad_norm": 0.5707713961601257, "learning_rate": 1.3950670551002298e-08, "loss": 0.0487, "num_input_tokens_seen": 42264960, "step": 42015 }, { "epoch": 19.81140971239981, "grad_norm": 1.8497768640518188, "learning_rate": 1.3609204803577413e-08, "loss": 0.1439, "num_input_tokens_seen": 42270496, "step": 42020 }, { "epoch": 19.813767090994816, "grad_norm": 0.1312236189842224, "learning_rate": 1.3271968827868742e-08, "loss": 0.1057, "num_input_tokens_seen": 42275904, "step": 42025 }, { "epoch": 19.816124469589816, "grad_norm": 1.2754402160644531, "learning_rate": 1.2938962680966727e-08, "loss": 0.2943, "num_input_tokens_seen": 42281504, "step": 42030 }, { "epoch": 19.818481848184817, "grad_norm": 0.031851258128881454, "learning_rate": 1.2610186419240166e-08, "loss": 0.041, "num_input_tokens_seen": 42285600, "step": 42035 }, { "epoch": 19.82083922677982, "grad_norm": 0.03917919471859932, "learning_rate": 1.2285640098347317e-08, "loss": 0.1159, "num_input_tokens_seen": 42290784, "step": 42040 }, { "epoch": 19.82319660537482, "grad_norm": 0.049911826848983765, "learning_rate": 1.1965323773230342e-08, "loss": 0.0373, "num_input_tokens_seen": 42295744, "step": 42045 }, { "epoch": 19.825553983969826, "grad_norm": 1.0581339597702026, "learning_rate": 1.1649237498109755e-08, "loss": 0.0541, "num_input_tokens_seen": 42300608, "step": 42050 }, { "epoch": 19.827911362564826, "grad_norm": 0.06717605143785477, "learning_rate": 1.1337381326495533e-08, "loss": 0.0804, "num_input_tokens_seen": 42305120, "step": 42055 }, { "epoch": 19.83026874115983, "grad_norm": 0.742065966129303, "learning_rate": 1.1029755311181556e-08, "loss": 0.0989, "num_input_tokens_seen": 42309056, "step": 42060 }, { "epoch": 19.83262611975483, "grad_norm": 0.10608500987291336, "learning_rate": 1.0726359504240058e-08, "loss": 0.0764, "num_input_tokens_seen": 42314624, "step": 42065 }, { "epoch": 19.834983498349835, "grad_norm": 0.8522493839263916, "learning_rate": 1.0427193957029958e-08, "loss": 0.0499, "num_input_tokens_seen": 42319488, "step": 42070 }, { "epoch": 19.837340876944836, "grad_norm": 0.8637657165527344, "learning_rate": 1.0132258720199627e-08, "loss": 0.1208, "num_input_tokens_seen": 42327296, "step": 42075 }, { "epoch": 19.83969825553984, "grad_norm": 0.4156038165092468, "learning_rate": 9.841553843673023e-09, "loss": 0.0791, "num_input_tokens_seen": 42331232, "step": 42080 }, { "epoch": 19.84205563413484, "grad_norm": 1.07932448387146, "learning_rate": 9.555079376663556e-09, "loss": 0.188, "num_input_tokens_seen": 42335200, "step": 42085 }, { "epoch": 19.844413012729845, "grad_norm": 0.3526338040828705, "learning_rate": 9.27283536766299e-09, "loss": 0.244, "num_input_tokens_seen": 42340512, "step": 42090 }, { "epoch": 19.846770391324846, "grad_norm": 0.7283880114555359, "learning_rate": 8.994821864455328e-09, "loss": 0.1904, "num_input_tokens_seen": 42345344, "step": 42095 }, { "epoch": 19.84912776991985, "grad_norm": 0.6026824712753296, "learning_rate": 8.721038914100144e-09, "loss": 0.0472, "num_input_tokens_seen": 42350784, "step": 42100 }, { "epoch": 19.85148514851485, "grad_norm": 0.06038690358400345, "learning_rate": 8.451486562946476e-09, "loss": 0.0871, "num_input_tokens_seen": 42354688, "step": 42105 }, { "epoch": 19.853842527109855, "grad_norm": 0.40326836705207825, "learning_rate": 8.18616485662449e-09, "loss": 0.2348, "num_input_tokens_seen": 42359936, "step": 42110 }, { "epoch": 19.856199905704855, "grad_norm": 2.4951705932617188, "learning_rate": 7.925073840045483e-09, "loss": 0.0361, "num_input_tokens_seen": 42365056, "step": 42115 }, { "epoch": 19.85855728429986, "grad_norm": 0.35090991854667664, "learning_rate": 7.668213557410209e-09, "loss": 0.1854, "num_input_tokens_seen": 42369248, "step": 42120 }, { "epoch": 19.86091466289486, "grad_norm": 0.05402063578367233, "learning_rate": 7.4155840522033285e-09, "loss": 0.0206, "num_input_tokens_seen": 42374016, "step": 42125 }, { "epoch": 19.863272041489864, "grad_norm": 0.8616724610328674, "learning_rate": 7.167185367187856e-09, "loss": 0.0815, "num_input_tokens_seen": 42378560, "step": 42130 }, { "epoch": 19.865629420084865, "grad_norm": 0.19224147498607635, "learning_rate": 6.923017544410715e-09, "loss": 0.0156, "num_input_tokens_seen": 42384704, "step": 42135 }, { "epoch": 19.86798679867987, "grad_norm": 0.34279173612594604, "learning_rate": 6.6830806252110574e-09, "loss": 0.2035, "num_input_tokens_seen": 42388992, "step": 42140 }, { "epoch": 19.87034417727487, "grad_norm": 0.04338805750012398, "learning_rate": 6.447374650203619e-09, "loss": 0.0869, "num_input_tokens_seen": 42393888, "step": 42145 }, { "epoch": 19.872701555869874, "grad_norm": 0.17252415418624878, "learning_rate": 6.215899659287039e-09, "loss": 0.0239, "num_input_tokens_seen": 42399168, "step": 42150 }, { "epoch": 19.875058934464874, "grad_norm": 1.7630460262298584, "learning_rate": 5.988655691649414e-09, "loss": 0.0961, "num_input_tokens_seen": 42404096, "step": 42155 }, { "epoch": 19.87741631305988, "grad_norm": 0.5842421650886536, "learning_rate": 5.765642785759973e-09, "loss": 0.1537, "num_input_tokens_seen": 42408864, "step": 42160 }, { "epoch": 19.87977369165488, "grad_norm": 2.9445407390594482, "learning_rate": 5.546860979366298e-09, "loss": 0.1442, "num_input_tokens_seen": 42414944, "step": 42165 }, { "epoch": 19.882131070249883, "grad_norm": 1.440362811088562, "learning_rate": 5.33231030951098e-09, "loss": 0.3212, "num_input_tokens_seen": 42419424, "step": 42170 }, { "epoch": 19.884488448844884, "grad_norm": 0.16750891506671906, "learning_rate": 5.121990812506638e-09, "loss": 0.1096, "num_input_tokens_seen": 42424640, "step": 42175 }, { "epoch": 19.88684582743989, "grad_norm": 0.05093478038907051, "learning_rate": 4.915902523960902e-09, "loss": 0.0654, "num_input_tokens_seen": 42429824, "step": 42180 }, { "epoch": 19.88920320603489, "grad_norm": 1.8400213718414307, "learning_rate": 4.7140454787625295e-09, "loss": 0.067, "num_input_tokens_seen": 42435008, "step": 42185 }, { "epoch": 19.891560584629893, "grad_norm": 0.17884498834609985, "learning_rate": 4.5164197110786346e-09, "loss": 0.0189, "num_input_tokens_seen": 42439328, "step": 42190 }, { "epoch": 19.893917963224894, "grad_norm": 0.6761934161186218, "learning_rate": 4.323025254368562e-09, "loss": 0.0754, "num_input_tokens_seen": 42444672, "step": 42195 }, { "epoch": 19.896275341819898, "grad_norm": 0.019983939826488495, "learning_rate": 4.133862141364464e-09, "loss": 0.0755, "num_input_tokens_seen": 42451552, "step": 42200 }, { "epoch": 19.8986327204149, "grad_norm": 0.09764651209115982, "learning_rate": 3.948930404093498e-09, "loss": 0.0902, "num_input_tokens_seen": 42456704, "step": 42205 }, { "epoch": 19.900990099009903, "grad_norm": 0.34022292494773865, "learning_rate": 3.768230073861179e-09, "loss": 0.0517, "num_input_tokens_seen": 42460960, "step": 42210 }, { "epoch": 19.903347477604903, "grad_norm": 0.5109573602676392, "learning_rate": 3.591761181251374e-09, "loss": 0.1933, "num_input_tokens_seen": 42465344, "step": 42215 }, { "epoch": 19.905704856199904, "grad_norm": 0.38682472705841064, "learning_rate": 3.419523756145737e-09, "loss": 0.0237, "num_input_tokens_seen": 42471168, "step": 42220 }, { "epoch": 19.908062234794908, "grad_norm": 0.7055312395095825, "learning_rate": 3.2515178276959492e-09, "loss": 0.0227, "num_input_tokens_seen": 42475520, "step": 42225 }, { "epoch": 19.91041961338991, "grad_norm": 0.18405966460704803, "learning_rate": 3.087743424343148e-09, "loss": 0.062, "num_input_tokens_seen": 42482592, "step": 42230 }, { "epoch": 19.912776991984913, "grad_norm": 1.0705978870391846, "learning_rate": 2.928200573809603e-09, "loss": 0.0593, "num_input_tokens_seen": 42488896, "step": 42235 }, { "epoch": 19.915134370579914, "grad_norm": 0.1221209317445755, "learning_rate": 2.772889303109816e-09, "loss": 0.0169, "num_input_tokens_seen": 42493728, "step": 42240 }, { "epoch": 19.917491749174918, "grad_norm": 0.1339026391506195, "learning_rate": 2.6218096385283164e-09, "loss": 0.0251, "num_input_tokens_seen": 42498976, "step": 42245 }, { "epoch": 19.91984912776992, "grad_norm": 0.18736714124679565, "learning_rate": 2.4749616056446433e-09, "loss": 0.066, "num_input_tokens_seen": 42502400, "step": 42250 }, { "epoch": 19.922206506364923, "grad_norm": 0.012059965170919895, "learning_rate": 2.3323452293139147e-09, "loss": 0.0596, "num_input_tokens_seen": 42507712, "step": 42255 }, { "epoch": 19.924563884959923, "grad_norm": 0.3636368215084076, "learning_rate": 2.193960533683481e-09, "loss": 0.1051, "num_input_tokens_seen": 42511584, "step": 42260 }, { "epoch": 19.926921263554927, "grad_norm": 0.48788946866989136, "learning_rate": 2.059807542176273e-09, "loss": 0.0778, "num_input_tokens_seen": 42516384, "step": 42265 }, { "epoch": 19.929278642149928, "grad_norm": 0.447113573551178, "learning_rate": 1.9298862775019024e-09, "loss": 0.0808, "num_input_tokens_seen": 42520928, "step": 42270 }, { "epoch": 19.931636020744932, "grad_norm": 1.3967069387435913, "learning_rate": 1.8041967616566623e-09, "loss": 0.1167, "num_input_tokens_seen": 42525888, "step": 42275 }, { "epoch": 19.933993399339933, "grad_norm": 0.5167384147644043, "learning_rate": 1.6827390159124267e-09, "loss": 0.1305, "num_input_tokens_seen": 42532128, "step": 42280 }, { "epoch": 19.936350777934937, "grad_norm": 0.061752550303936005, "learning_rate": 1.5655130608360768e-09, "loss": 0.0612, "num_input_tokens_seen": 42537344, "step": 42285 }, { "epoch": 19.938708156529938, "grad_norm": 0.8789992332458496, "learning_rate": 1.452518916267298e-09, "loss": 0.119, "num_input_tokens_seen": 42542688, "step": 42290 }, { "epoch": 19.94106553512494, "grad_norm": 2.3629231452941895, "learning_rate": 1.3437566013380088e-09, "loss": 0.1853, "num_input_tokens_seen": 42548480, "step": 42295 }, { "epoch": 19.943422913719942, "grad_norm": 0.25456684827804565, "learning_rate": 1.2392261344557066e-09, "loss": 0.1631, "num_input_tokens_seen": 42553120, "step": 42300 }, { "epoch": 19.945780292314947, "grad_norm": 0.7663536667823792, "learning_rate": 1.1389275333173465e-09, "loss": 0.0843, "num_input_tokens_seen": 42557344, "step": 42305 }, { "epoch": 19.948137670909947, "grad_norm": 0.6040956974029541, "learning_rate": 1.0428608149037899e-09, "loss": 0.1801, "num_input_tokens_seen": 42563232, "step": 42310 }, { "epoch": 19.95049504950495, "grad_norm": 0.9828457236289978, "learning_rate": 9.510259954742529e-10, "loss": 0.2503, "num_input_tokens_seen": 42568800, "step": 42315 }, { "epoch": 19.952852428099952, "grad_norm": 0.25570350885391235, "learning_rate": 8.634230905774088e-10, "loss": 0.0576, "num_input_tokens_seen": 42574432, "step": 42320 }, { "epoch": 19.955209806694956, "grad_norm": 2.1185402870178223, "learning_rate": 7.800521150430617e-10, "loss": 0.1897, "num_input_tokens_seen": 42580352, "step": 42325 }, { "epoch": 19.957567185289957, "grad_norm": 0.029966074973344803, "learning_rate": 7.009130829793709e-10, "loss": 0.0891, "num_input_tokens_seen": 42584992, "step": 42330 }, { "epoch": 19.95992456388496, "grad_norm": 1.506759524345398, "learning_rate": 6.260060077922791e-10, "loss": 0.2233, "num_input_tokens_seen": 42589920, "step": 42335 }, { "epoch": 19.96228194247996, "grad_norm": 0.017235860228538513, "learning_rate": 5.553309021522069e-10, "loss": 0.1514, "num_input_tokens_seen": 42595488, "step": 42340 }, { "epoch": 19.964639321074966, "grad_norm": 0.09101032465696335, "learning_rate": 4.888877780301338e-10, "loss": 0.0627, "num_input_tokens_seen": 42599520, "step": 42345 }, { "epoch": 19.966996699669966, "grad_norm": 1.4629334211349487, "learning_rate": 4.266766466726191e-10, "loss": 0.2122, "num_input_tokens_seen": 42604736, "step": 42350 }, { "epoch": 19.96935407826497, "grad_norm": 0.01169714704155922, "learning_rate": 3.6869751861012827e-10, "loss": 0.0457, "num_input_tokens_seen": 42610496, "step": 42355 }, { "epoch": 19.97171145685997, "grad_norm": 2.851914644241333, "learning_rate": 3.1495040365703275e-10, "loss": 0.0513, "num_input_tokens_seen": 42616320, "step": 42360 }, { "epoch": 19.974068835454975, "grad_norm": 0.45481687784194946, "learning_rate": 2.6543531091161033e-10, "loss": 0.0555, "num_input_tokens_seen": 42621120, "step": 42365 }, { "epoch": 19.976426214049976, "grad_norm": 0.6098357439041138, "learning_rate": 2.2015224875882035e-10, "loss": 0.1259, "num_input_tokens_seen": 42625920, "step": 42370 }, { "epoch": 19.97878359264498, "grad_norm": 2.1139872074127197, "learning_rate": 1.7910122485920167e-10, "loss": 0.19, "num_input_tokens_seen": 42629920, "step": 42375 }, { "epoch": 19.98114097123998, "grad_norm": 0.24733735620975494, "learning_rate": 1.4228224616830154e-10, "loss": 0.0506, "num_input_tokens_seen": 42634688, "step": 42380 }, { "epoch": 19.983498349834985, "grad_norm": 1.7880845069885254, "learning_rate": 1.0969531891447115e-10, "loss": 0.1243, "num_input_tokens_seen": 42641056, "step": 42385 }, { "epoch": 19.985855728429986, "grad_norm": 1.4931138753890991, "learning_rate": 8.134044861274337e-11, "loss": 0.1045, "num_input_tokens_seen": 42646016, "step": 42390 }, { "epoch": 19.98821310702499, "grad_norm": 0.0819353461265564, "learning_rate": 5.7217640067608326e-11, "loss": 0.1725, "num_input_tokens_seen": 42651232, "step": 42395 }, { "epoch": 19.99057048561999, "grad_norm": 1.33394455909729, "learning_rate": 3.7326897361911197e-11, "loss": 0.1962, "num_input_tokens_seen": 42656512, "step": 42400 }, { "epoch": 19.992927864214995, "grad_norm": 0.35177135467529297, "learning_rate": 2.1668223859627744e-11, "loss": 0.0387, "num_input_tokens_seen": 42661088, "step": 42405 }, { "epoch": 19.995285242809995, "grad_norm": 0.835017740726471, "learning_rate": 1.0241622214191005e-11, "loss": 0.0857, "num_input_tokens_seen": 42666496, "step": 42410 }, { "epoch": 19.997642621405, "grad_norm": 1.4513460397720337, "learning_rate": 3.0470943573890354e-12, "loss": 0.1275, "num_input_tokens_seen": 42673888, "step": 42415 }, { "epoch": 20.0, "grad_norm": 0.06102943420410156, "learning_rate": 8.464151046716495e-14, "loss": 0.0778, "num_input_tokens_seen": 42678144, "step": 42420 }, { "epoch": 20.0, "eval_loss": 0.15385045111179352, "eval_runtime": 15.2093, "eval_samples_per_second": 62.001, "eval_steps_per_second": 15.517, "num_input_tokens_seen": 42678144, "step": 42420 }, { "epoch": 20.0, "num_input_tokens_seen": 42678144, "step": 42420, "total_flos": 1.921827880351826e+18, "train_loss": 0.14184911905681186, "train_runtime": 9228.029, "train_samples_per_second": 18.387, "train_steps_per_second": 4.597 } ], "logging_steps": 5, "max_steps": 42420, "num_input_tokens_seen": 42678144, "num_train_epochs": 20, "save_steps": 2121, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.921827880351826e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }