{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3997902739546467, "eval_steps": 500, "global_step": 6100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.553938917289291e-05, "grad_norm": 0.77734375, "learning_rate": 0.0, "loss": 1.7594, "num_input_tokens_seen": 65536, "step": 1, "train_runtime": 16.9683, "train_tokens_per_second": 3862.269 }, { "epoch": 0.0006553938917289291, "grad_norm": 0.765625, "learning_rate": 9.846827133479213e-07, "loss": 1.7397, "num_input_tokens_seen": 655360, "step": 10, "train_runtime": 61.4988, "train_tokens_per_second": 10656.476 }, { "epoch": 0.0013107877834578582, "grad_norm": 0.78515625, "learning_rate": 2.078774617067834e-06, "loss": 1.7428, "num_input_tokens_seen": 1310720, "step": 20, "train_runtime": 112.5364, "train_tokens_per_second": 11647.075 }, { "epoch": 0.001966181675186787, "grad_norm": 0.828125, "learning_rate": 3.1728665207877464e-06, "loss": 1.7516, "num_input_tokens_seen": 1966080, "step": 30, "train_runtime": 161.2864, "train_tokens_per_second": 12189.996 }, { "epoch": 0.0026215755669157163, "grad_norm": 0.69140625, "learning_rate": 4.266958424507659e-06, "loss": 1.7504, "num_input_tokens_seen": 2621440, "step": 40, "train_runtime": 211.1557, "train_tokens_per_second": 12414.725 }, { "epoch": 0.0032769694586446455, "grad_norm": 0.79296875, "learning_rate": 5.361050328227572e-06, "loss": 1.7268, "num_input_tokens_seen": 3276800, "step": 50, "train_runtime": 260.4769, "train_tokens_per_second": 12580.002 }, { "epoch": 0.003932363350373574, "grad_norm": 0.7578125, "learning_rate": 6.455142231947483e-06, "loss": 1.7374, "num_input_tokens_seen": 3932160, "step": 60, "train_runtime": 309.3237, "train_tokens_per_second": 12712.122 }, { "epoch": 0.0045877572421025034, "grad_norm": 0.7734375, "learning_rate": 7.549234135667396e-06, "loss": 1.7397, "num_input_tokens_seen": 4587520, "step": 70, "train_runtime": 358.6518, "train_tokens_per_second": 12791.013 }, { "epoch": 0.005243151133831433, "grad_norm": 0.78125, "learning_rate": 8.643326039387308e-06, "loss": 1.7328, "num_input_tokens_seen": 5242880, "step": 80, "train_runtime": 407.7327, "train_tokens_per_second": 12858.62 }, { "epoch": 0.005898545025560362, "grad_norm": 0.78125, "learning_rate": 9.737417943107221e-06, "loss": 1.7383, "num_input_tokens_seen": 5898240, "step": 90, "train_runtime": 456.949, "train_tokens_per_second": 12907.875 }, { "epoch": 0.006553938917289291, "grad_norm": 0.7421875, "learning_rate": 1.0831509846827134e-05, "loss": 1.716, "num_input_tokens_seen": 6553600, "step": 100, "train_runtime": 505.9562, "train_tokens_per_second": 12952.901 }, { "epoch": 0.00720933280901822, "grad_norm": 0.71875, "learning_rate": 1.1925601750547047e-05, "loss": 1.7308, "num_input_tokens_seen": 7208960, "step": 110, "train_runtime": 556.0224, "train_tokens_per_second": 12965.232 }, { "epoch": 0.007864726700747149, "grad_norm": 0.7421875, "learning_rate": 1.301969365426696e-05, "loss": 1.7395, "num_input_tokens_seen": 7864320, "step": 120, "train_runtime": 604.0115, "train_tokens_per_second": 13020.149 }, { "epoch": 0.008520120592476078, "grad_norm": 0.72265625, "learning_rate": 1.4113785557986872e-05, "loss": 1.7173, "num_input_tokens_seen": 8519680, "step": 130, "train_runtime": 654.5151, "train_tokens_per_second": 13016.781 }, { "epoch": 0.009175514484205007, "grad_norm": 0.75390625, "learning_rate": 1.5207877461706785e-05, "loss": 1.7237, "num_input_tokens_seen": 9175040, "step": 140, "train_runtime": 701.7059, "train_tokens_per_second": 13075.336 }, { "epoch": 0.009830908375933936, "grad_norm": 0.64453125, "learning_rate": 1.6301969365426696e-05, "loss": 1.7206, "num_input_tokens_seen": 9830400, "step": 150, "train_runtime": 752.7316, "train_tokens_per_second": 13059.635 }, { "epoch": 0.010486302267662865, "grad_norm": 0.64453125, "learning_rate": 1.739606126914661e-05, "loss": 1.7134, "num_input_tokens_seen": 10485760, "step": 160, "train_runtime": 800.379, "train_tokens_per_second": 13100.993 }, { "epoch": 0.011141696159391794, "grad_norm": 0.61328125, "learning_rate": 1.8490153172866522e-05, "loss": 1.7236, "num_input_tokens_seen": 11141120, "step": 170, "train_runtime": 851.4503, "train_tokens_per_second": 13084.874 }, { "epoch": 0.011797090051120724, "grad_norm": 0.64453125, "learning_rate": 1.9584245076586437e-05, "loss": 1.7292, "num_input_tokens_seen": 11796480, "step": 180, "train_runtime": 898.766, "train_tokens_per_second": 13125.196 }, { "epoch": 0.012452483942849653, "grad_norm": 0.6171875, "learning_rate": 2.0678336980306344e-05, "loss": 1.7169, "num_input_tokens_seen": 12451840, "step": 190, "train_runtime": 949.8294, "train_tokens_per_second": 13109.555 }, { "epoch": 0.013107877834578582, "grad_norm": 0.68359375, "learning_rate": 2.177242888402626e-05, "loss": 1.7011, "num_input_tokens_seen": 13107200, "step": 200, "train_runtime": 997.2935, "train_tokens_per_second": 13142.771 }, { "epoch": 0.013763271726307511, "grad_norm": 0.6015625, "learning_rate": 2.286652078774617e-05, "loss": 1.6889, "num_input_tokens_seen": 13762560, "step": 210, "train_runtime": 1048.2353, "train_tokens_per_second": 13129.266 }, { "epoch": 0.01441866561803644, "grad_norm": 0.5546875, "learning_rate": 2.3960612691466082e-05, "loss": 1.6917, "num_input_tokens_seen": 14417920, "step": 220, "train_runtime": 1095.9031, "train_tokens_per_second": 13156.2 }, { "epoch": 0.01507405950976537, "grad_norm": 0.54296875, "learning_rate": 2.5054704595185996e-05, "loss": 1.6929, "num_input_tokens_seen": 15073280, "step": 230, "train_runtime": 1146.8841, "train_tokens_per_second": 13142.81 }, { "epoch": 0.015729453401494297, "grad_norm": 0.54296875, "learning_rate": 2.6148796498905908e-05, "loss": 1.6834, "num_input_tokens_seen": 15728640, "step": 240, "train_runtime": 1194.1874, "train_tokens_per_second": 13170.998 }, { "epoch": 0.016384847293223228, "grad_norm": 0.48828125, "learning_rate": 2.7242888402625822e-05, "loss": 1.6825, "num_input_tokens_seen": 16384000, "step": 250, "train_runtime": 1244.9675, "train_tokens_per_second": 13160.183 }, { "epoch": 0.017040241184952155, "grad_norm": 0.47265625, "learning_rate": 2.8336980306345734e-05, "loss": 1.6751, "num_input_tokens_seen": 17039360, "step": 260, "train_runtime": 1291.9309, "train_tokens_per_second": 13189.064 }, { "epoch": 0.017695635076681086, "grad_norm": 0.458984375, "learning_rate": 2.9431072210065645e-05, "loss": 1.6754, "num_input_tokens_seen": 17694720, "step": 270, "train_runtime": 1342.4301, "train_tokens_per_second": 13181.111 }, { "epoch": 0.018351028968410014, "grad_norm": 0.431640625, "learning_rate": 3.052516411378556e-05, "loss": 1.6772, "num_input_tokens_seen": 18350080, "step": 280, "train_runtime": 1390.1212, "train_tokens_per_second": 13200.346 }, { "epoch": 0.019006422860138945, "grad_norm": 0.38671875, "learning_rate": 3.161925601750547e-05, "loss": 1.6482, "num_input_tokens_seen": 19005440, "step": 290, "train_runtime": 1441.2385, "train_tokens_per_second": 13186.88 }, { "epoch": 0.019661816751867872, "grad_norm": 0.3828125, "learning_rate": 3.271334792122538e-05, "loss": 1.6574, "num_input_tokens_seen": 19660800, "step": 300, "train_runtime": 1488.8948, "train_tokens_per_second": 13204.962 }, { "epoch": 0.020317210643596803, "grad_norm": 0.3515625, "learning_rate": 3.38074398249453e-05, "loss": 1.6508, "num_input_tokens_seen": 20316160, "step": 310, "train_runtime": 1539.8026, "train_tokens_per_second": 13194.003 }, { "epoch": 0.02097260453532573, "grad_norm": 0.34765625, "learning_rate": 3.490153172866521e-05, "loss": 1.6411, "num_input_tokens_seen": 20971520, "step": 320, "train_runtime": 1587.0427, "train_tokens_per_second": 13214.213 }, { "epoch": 0.02162799842705466, "grad_norm": 0.361328125, "learning_rate": 3.599562363238512e-05, "loss": 1.648, "num_input_tokens_seen": 21626880, "step": 330, "train_runtime": 1638.1822, "train_tokens_per_second": 13201.755 }, { "epoch": 0.02228339231878359, "grad_norm": 0.3046875, "learning_rate": 3.7089715536105034e-05, "loss": 1.6634, "num_input_tokens_seen": 22282240, "step": 340, "train_runtime": 1685.7785, "train_tokens_per_second": 13217.775 }, { "epoch": 0.022938786210512516, "grad_norm": 0.310546875, "learning_rate": 3.8183807439824946e-05, "loss": 1.6493, "num_input_tokens_seen": 22937600, "step": 350, "train_runtime": 1736.8638, "train_tokens_per_second": 13206.332 }, { "epoch": 0.023594180102241447, "grad_norm": 0.314453125, "learning_rate": 3.9277899343544864e-05, "loss": 1.6344, "num_input_tokens_seen": 23592960, "step": 360, "train_runtime": 1784.2444, "train_tokens_per_second": 13222.942 }, { "epoch": 0.024249573993970375, "grad_norm": 0.283203125, "learning_rate": 4.0371991247264775e-05, "loss": 1.65, "num_input_tokens_seen": 24248320, "step": 370, "train_runtime": 1835.3058, "train_tokens_per_second": 13212.142 }, { "epoch": 0.024904967885699306, "grad_norm": 0.3125, "learning_rate": 4.1466083150984686e-05, "loss": 1.6419, "num_input_tokens_seen": 24903680, "step": 380, "train_runtime": 1882.6326, "train_tokens_per_second": 13228.115 }, { "epoch": 0.025560361777428233, "grad_norm": 0.375, "learning_rate": 4.25601750547046e-05, "loss": 1.6444, "num_input_tokens_seen": 25559040, "step": 390, "train_runtime": 1933.8864, "train_tokens_per_second": 13216.412 }, { "epoch": 0.026215755669157164, "grad_norm": 0.326171875, "learning_rate": 4.365426695842451e-05, "loss": 1.6567, "num_input_tokens_seen": 26214400, "step": 400, "train_runtime": 1981.6991, "train_tokens_per_second": 13228.245 }, { "epoch": 0.02687114956088609, "grad_norm": 0.26171875, "learning_rate": 4.474835886214443e-05, "loss": 1.638, "num_input_tokens_seen": 26869760, "step": 410, "train_runtime": 2032.6931, "train_tokens_per_second": 13218.798 }, { "epoch": 0.027526543452615022, "grad_norm": 0.287109375, "learning_rate": 4.584245076586434e-05, "loss": 1.6295, "num_input_tokens_seen": 27525120, "step": 420, "train_runtime": 2079.8298, "train_tokens_per_second": 13234.314 }, { "epoch": 0.02818193734434395, "grad_norm": 0.3359375, "learning_rate": 4.693654266958425e-05, "loss": 1.615, "num_input_tokens_seen": 28180480, "step": 430, "train_runtime": 2130.844, "train_tokens_per_second": 13225.032 }, { "epoch": 0.02883733123607288, "grad_norm": 0.314453125, "learning_rate": 4.803063457330416e-05, "loss": 1.6198, "num_input_tokens_seen": 28835840, "step": 440, "train_runtime": 2178.6516, "train_tokens_per_second": 13235.636 }, { "epoch": 0.029492725127801808, "grad_norm": 0.27734375, "learning_rate": 4.912472647702407e-05, "loss": 1.6351, "num_input_tokens_seen": 29491200, "step": 450, "train_runtime": 2229.5438, "train_tokens_per_second": 13227.46 }, { "epoch": 0.03014811901953074, "grad_norm": 0.2314453125, "learning_rate": 4.999999774738161e-05, "loss": 1.6226, "num_input_tokens_seen": 30146560, "step": 460, "train_runtime": 2278.2951, "train_tokens_per_second": 13232.07 }, { "epoch": 0.030803512911259667, "grad_norm": 0.3359375, "learning_rate": 4.999991890578046e-05, "loss": 1.6168, "num_input_tokens_seen": 30801920, "step": 470, "train_runtime": 2329.1834, "train_tokens_per_second": 13224.343 }, { "epoch": 0.031458906802988594, "grad_norm": 0.2734375, "learning_rate": 4.999972743366559e-05, "loss": 1.608, "num_input_tokens_seen": 31457280, "step": 480, "train_runtime": 2376.2953, "train_tokens_per_second": 13237.95 }, { "epoch": 0.032114300694717525, "grad_norm": 0.2890625, "learning_rate": 4.9999423331899625e-05, "loss": 1.5878, "num_input_tokens_seen": 32112640, "step": 490, "train_runtime": 2426.6251, "train_tokens_per_second": 13233.458 }, { "epoch": 0.032769694586446456, "grad_norm": 0.275390625, "learning_rate": 4.99990066018526e-05, "loss": 1.5832, "num_input_tokens_seen": 32768000, "step": 500, "train_runtime": 2474.981, "train_tokens_per_second": 13239.698 }, { "epoch": 0.03342508847817539, "grad_norm": 0.255859375, "learning_rate": 4.9998477245402e-05, "loss": 1.602, "num_input_tokens_seen": 33423360, "step": 510, "train_runtime": 2525.3117, "train_tokens_per_second": 13235.34 }, { "epoch": 0.03408048236990431, "grad_norm": 0.263671875, "learning_rate": 4.999783526493269e-05, "loss": 1.5967, "num_input_tokens_seen": 34078720, "step": 520, "train_runtime": 2574.426, "train_tokens_per_second": 13237.405 }, { "epoch": 0.03473587626163324, "grad_norm": 0.228515625, "learning_rate": 4.999708066333695e-05, "loss": 1.608, "num_input_tokens_seen": 34734080, "step": 530, "train_runtime": 2623.912, "train_tokens_per_second": 13237.517 }, { "epoch": 0.03539127015336217, "grad_norm": 0.2490234375, "learning_rate": 4.999621344401443e-05, "loss": 1.6062, "num_input_tokens_seen": 35389440, "step": 540, "train_runtime": 2673.0704, "train_tokens_per_second": 13239.247 }, { "epoch": 0.0360466640450911, "grad_norm": 0.3125, "learning_rate": 4.999523361087216e-05, "loss": 1.5916, "num_input_tokens_seen": 36044800, "step": 550, "train_runtime": 2722.5066, "train_tokens_per_second": 13239.564 }, { "epoch": 0.03670205793682003, "grad_norm": 0.251953125, "learning_rate": 4.999414116832452e-05, "loss": 1.6093, "num_input_tokens_seen": 36700160, "step": 560, "train_runtime": 2771.7839, "train_tokens_per_second": 13240.628 }, { "epoch": 0.03735745182854896, "grad_norm": 0.2421875, "learning_rate": 4.999293612129321e-05, "loss": 1.6012, "num_input_tokens_seen": 37355520, "step": 570, "train_runtime": 2820.9186, "train_tokens_per_second": 13242.325 }, { "epoch": 0.03801284572027789, "grad_norm": 0.25, "learning_rate": 4.9991618475207276e-05, "loss": 1.588, "num_input_tokens_seen": 38010880, "step": 580, "train_runtime": 2869.836, "train_tokens_per_second": 13244.966 }, { "epoch": 0.03866823961200681, "grad_norm": 0.271484375, "learning_rate": 4.9990188236003e-05, "loss": 1.595, "num_input_tokens_seen": 38666240, "step": 590, "train_runtime": 2918.5803, "train_tokens_per_second": 13248.304 }, { "epoch": 0.039323633503735744, "grad_norm": 0.255859375, "learning_rate": 4.9988645410123955e-05, "loss": 1.6085, "num_input_tokens_seen": 39321600, "step": 600, "train_runtime": 2968.4948, "train_tokens_per_second": 13246.309 }, { "epoch": 0.039979027395464675, "grad_norm": 0.265625, "learning_rate": 4.998699000452093e-05, "loss": 1.6159, "num_input_tokens_seen": 39976960, "step": 610, "train_runtime": 3017.3526, "train_tokens_per_second": 13249.019 }, { "epoch": 0.040634421287193606, "grad_norm": 0.21875, "learning_rate": 4.998522202665192e-05, "loss": 1.5972, "num_input_tokens_seen": 40632320, "step": 620, "train_runtime": 3068.1767, "train_tokens_per_second": 13243.148 }, { "epoch": 0.04128981517892253, "grad_norm": 0.265625, "learning_rate": 4.998334148448208e-05, "loss": 1.5949, "num_input_tokens_seen": 41287680, "step": 630, "train_runtime": 3116.1224, "train_tokens_per_second": 13249.698 }, { "epoch": 0.04194520907065146, "grad_norm": 0.1923828125, "learning_rate": 4.99813483864837e-05, "loss": 1.6048, "num_input_tokens_seen": 41943040, "step": 640, "train_runtime": 3167.1611, "train_tokens_per_second": 13243.103 }, { "epoch": 0.04260060296238039, "grad_norm": 0.2255859375, "learning_rate": 4.997924274163614e-05, "loss": 1.6133, "num_input_tokens_seen": 42598400, "step": 650, "train_runtime": 3214.5588, "train_tokens_per_second": 13251.71 }, { "epoch": 0.04325599685410932, "grad_norm": 0.2041015625, "learning_rate": 4.997702455942584e-05, "loss": 1.6001, "num_input_tokens_seen": 43253760, "step": 660, "train_runtime": 3266.3916, "train_tokens_per_second": 13242.062 }, { "epoch": 0.04391139074583825, "grad_norm": 0.265625, "learning_rate": 4.997469384984623e-05, "loss": 1.5868, "num_input_tokens_seen": 43909120, "step": 670, "train_runtime": 3312.5608, "train_tokens_per_second": 13255.34 }, { "epoch": 0.04456678463756718, "grad_norm": 0.2275390625, "learning_rate": 4.99722506233977e-05, "loss": 1.5971, "num_input_tokens_seen": 44564480, "step": 680, "train_runtime": 3363.3933, "train_tokens_per_second": 13249.857 }, { "epoch": 0.04522217852929611, "grad_norm": 0.2236328125, "learning_rate": 4.996969489108757e-05, "loss": 1.5839, "num_input_tokens_seen": 45219840, "step": 690, "train_runtime": 3410.6096, "train_tokens_per_second": 13258.58 }, { "epoch": 0.04587757242102503, "grad_norm": 0.193359375, "learning_rate": 4.996702666443e-05, "loss": 1.5924, "num_input_tokens_seen": 45875200, "step": 700, "train_runtime": 3461.4336, "train_tokens_per_second": 13253.237 }, { "epoch": 0.046532966312753964, "grad_norm": 0.2490234375, "learning_rate": 4.996424595544599e-05, "loss": 1.5904, "num_input_tokens_seen": 46530560, "step": 710, "train_runtime": 3508.8687, "train_tokens_per_second": 13260.844 }, { "epoch": 0.047188360204482895, "grad_norm": 0.216796875, "learning_rate": 4.996135277666328e-05, "loss": 1.5876, "num_input_tokens_seen": 47185920, "step": 720, "train_runtime": 3559.9125, "train_tokens_per_second": 13254.798 }, { "epoch": 0.047843754096211825, "grad_norm": 0.19921875, "learning_rate": 4.995834714111633e-05, "loss": 1.5772, "num_input_tokens_seen": 47841280, "step": 730, "train_runtime": 3607.0789, "train_tokens_per_second": 13263.164 }, { "epoch": 0.04849914798794075, "grad_norm": 0.251953125, "learning_rate": 4.995522906234622e-05, "loss": 1.5654, "num_input_tokens_seen": 48496640, "step": 740, "train_runtime": 3657.2724, "train_tokens_per_second": 13260.33 }, { "epoch": 0.04915454187966968, "grad_norm": 0.248046875, "learning_rate": 4.995199855440065e-05, "loss": 1.588, "num_input_tokens_seen": 49152000, "step": 750, "train_runtime": 3704.8262, "train_tokens_per_second": 13267.019 }, { "epoch": 0.04980993577139861, "grad_norm": 0.2451171875, "learning_rate": 4.9948655631833794e-05, "loss": 1.5806, "num_input_tokens_seen": 49807360, "step": 760, "train_runtime": 3755.9627, "train_tokens_per_second": 13260.877 }, { "epoch": 0.05046532966312754, "grad_norm": 0.2470703125, "learning_rate": 4.9945200309706326e-05, "loss": 1.575, "num_input_tokens_seen": 50462720, "step": 770, "train_runtime": 3808.0737, "train_tokens_per_second": 13251.508 }, { "epoch": 0.051120723554856466, "grad_norm": 0.23046875, "learning_rate": 4.994163260358527e-05, "loss": 1.5894, "num_input_tokens_seen": 51118080, "step": 780, "train_runtime": 3858.7608, "train_tokens_per_second": 13247.279 }, { "epoch": 0.0517761174465854, "grad_norm": 0.255859375, "learning_rate": 4.993795252954398e-05, "loss": 1.5966, "num_input_tokens_seen": 51773440, "step": 790, "train_runtime": 3905.9626, "train_tokens_per_second": 13254.976 }, { "epoch": 0.05243151133831433, "grad_norm": 0.234375, "learning_rate": 4.993416010416208e-05, "loss": 1.6014, "num_input_tokens_seen": 52428800, "step": 800, "train_runtime": 3956.6521, "train_tokens_per_second": 13250.799 }, { "epoch": 0.05308690523004326, "grad_norm": 0.2021484375, "learning_rate": 4.99302553445253e-05, "loss": 1.5879, "num_input_tokens_seen": 53084160, "step": 810, "train_runtime": 4008.4409, "train_tokens_per_second": 13243.094 }, { "epoch": 0.05374229912177218, "grad_norm": 0.32421875, "learning_rate": 4.992623826822552e-05, "loss": 1.5716, "num_input_tokens_seen": 53739520, "step": 820, "train_runtime": 4059.6385, "train_tokens_per_second": 13237.514 }, { "epoch": 0.054397693013501114, "grad_norm": 0.2080078125, "learning_rate": 4.9922108893360624e-05, "loss": 1.5732, "num_input_tokens_seen": 54394880, "step": 830, "train_runtime": 4106.9088, "train_tokens_per_second": 13244.726 }, { "epoch": 0.055053086905230045, "grad_norm": 0.275390625, "learning_rate": 4.99178672385344e-05, "loss": 1.5857, "num_input_tokens_seen": 55050240, "step": 840, "train_runtime": 4157.852, "train_tokens_per_second": 13240.067 }, { "epoch": 0.055708480796958976, "grad_norm": 0.2158203125, "learning_rate": 4.9913513322856506e-05, "loss": 1.5815, "num_input_tokens_seen": 55705600, "step": 850, "train_runtime": 4205.0427, "train_tokens_per_second": 13247.333 }, { "epoch": 0.0563638746886879, "grad_norm": 0.23046875, "learning_rate": 4.9909047165942355e-05, "loss": 1.5668, "num_input_tokens_seen": 56360960, "step": 860, "train_runtime": 4255.5451, "train_tokens_per_second": 13244.122 }, { "epoch": 0.05701926858041683, "grad_norm": 0.208984375, "learning_rate": 4.990446878791304e-05, "loss": 1.5643, "num_input_tokens_seen": 57016320, "step": 870, "train_runtime": 4302.6838, "train_tokens_per_second": 13251.339 }, { "epoch": 0.05767466247214576, "grad_norm": 0.265625, "learning_rate": 4.989977820939522e-05, "loss": 1.573, "num_input_tokens_seen": 57671680, "step": 880, "train_runtime": 4353.5784, "train_tokens_per_second": 13246.96 }, { "epoch": 0.058330056363874686, "grad_norm": 0.2451171875, "learning_rate": 4.9894975451521064e-05, "loss": 1.5769, "num_input_tokens_seen": 58327040, "step": 890, "train_runtime": 4401.0234, "train_tokens_per_second": 13253.063 }, { "epoch": 0.058985450255603616, "grad_norm": 0.2119140625, "learning_rate": 4.989006053592812e-05, "loss": 1.5812, "num_input_tokens_seen": 58982400, "step": 900, "train_runtime": 4452.0611, "train_tokens_per_second": 13248.336 }, { "epoch": 0.05964084414733255, "grad_norm": 0.197265625, "learning_rate": 4.9885033484759244e-05, "loss": 1.5695, "num_input_tokens_seen": 59637760, "step": 910, "train_runtime": 4499.4135, "train_tokens_per_second": 13254.563 }, { "epoch": 0.06029623803906148, "grad_norm": 0.2041015625, "learning_rate": 4.9879894320662476e-05, "loss": 1.5557, "num_input_tokens_seen": 60293120, "step": 920, "train_runtime": 4550.2568, "train_tokens_per_second": 13250.487 }, { "epoch": 0.0609516319307904, "grad_norm": 0.2373046875, "learning_rate": 4.987464306679096e-05, "loss": 1.5656, "num_input_tokens_seen": 60948480, "step": 930, "train_runtime": 4597.5094, "train_tokens_per_second": 13256.847 }, { "epoch": 0.06160702582251933, "grad_norm": 0.234375, "learning_rate": 4.9869279746802844e-05, "loss": 1.5529, "num_input_tokens_seen": 61603840, "step": 940, "train_runtime": 4648.5718, "train_tokens_per_second": 13252.208 }, { "epoch": 0.062262419714248264, "grad_norm": 0.216796875, "learning_rate": 4.986380438486113e-05, "loss": 1.5911, "num_input_tokens_seen": 62259200, "step": 950, "train_runtime": 4696.0121, "train_tokens_per_second": 13257.888 }, { "epoch": 0.06291781360597719, "grad_norm": 0.2138671875, "learning_rate": 4.985821700563361e-05, "loss": 1.5633, "num_input_tokens_seen": 62914560, "step": 960, "train_runtime": 4746.7623, "train_tokens_per_second": 13254.205 }, { "epoch": 0.06357320749770612, "grad_norm": 0.318359375, "learning_rate": 4.985251763429275e-05, "loss": 1.5809, "num_input_tokens_seen": 63569920, "step": 970, "train_runtime": 4794.2615, "train_tokens_per_second": 13259.586 }, { "epoch": 0.06422860138943505, "grad_norm": 0.224609375, "learning_rate": 4.984670629651555e-05, "loss": 1.5697, "num_input_tokens_seen": 64225280, "step": 980, "train_runtime": 4845.2768, "train_tokens_per_second": 13255.235 }, { "epoch": 0.06488399528116398, "grad_norm": 0.314453125, "learning_rate": 4.984078301848347e-05, "loss": 1.5638, "num_input_tokens_seen": 64880640, "step": 990, "train_runtime": 4893.0218, "train_tokens_per_second": 13259.831 }, { "epoch": 0.06553938917289291, "grad_norm": 0.2080078125, "learning_rate": 4.983474782688226e-05, "loss": 1.5739, "num_input_tokens_seen": 65536000, "step": 1000, "train_runtime": 4944.2063, "train_tokens_per_second": 13255.11 }, { "epoch": 0.06619478306462184, "grad_norm": 0.2119140625, "learning_rate": 4.982860074890187e-05, "loss": 1.5783, "num_input_tokens_seen": 66191360, "step": 1010, "train_runtime": 4990.6255, "train_tokens_per_second": 13263.139 }, { "epoch": 0.06685017695635077, "grad_norm": 0.205078125, "learning_rate": 4.982234181223633e-05, "loss": 1.5805, "num_input_tokens_seen": 66846720, "step": 1020, "train_runtime": 5041.6388, "train_tokens_per_second": 13258.927 }, { "epoch": 0.06750557084807969, "grad_norm": 0.259765625, "learning_rate": 4.981597104508364e-05, "loss": 1.5818, "num_input_tokens_seen": 67502080, "step": 1030, "train_runtime": 5089.2528, "train_tokens_per_second": 13263.652 }, { "epoch": 0.06816096473980862, "grad_norm": 0.212890625, "learning_rate": 4.9809488476145584e-05, "loss": 1.5711, "num_input_tokens_seen": 68157440, "step": 1040, "train_runtime": 5142.0432, "train_tokens_per_second": 13254.933 }, { "epoch": 0.06881635863153755, "grad_norm": 0.236328125, "learning_rate": 4.9802894134627675e-05, "loss": 1.5688, "num_input_tokens_seen": 68812800, "step": 1050, "train_runtime": 5189.3784, "train_tokens_per_second": 13260.317 }, { "epoch": 0.06947175252326648, "grad_norm": 0.2197265625, "learning_rate": 4.979618805023897e-05, "loss": 1.5825, "num_input_tokens_seen": 69468160, "step": 1060, "train_runtime": 5239.782, "train_tokens_per_second": 13257.834 }, { "epoch": 0.07012714641499541, "grad_norm": 0.2177734375, "learning_rate": 4.978937025319195e-05, "loss": 1.5779, "num_input_tokens_seen": 70123520, "step": 1070, "train_runtime": 5286.4512, "train_tokens_per_second": 13264.762 }, { "epoch": 0.07078254030672435, "grad_norm": 0.2177734375, "learning_rate": 4.97824407742024e-05, "loss": 1.5958, "num_input_tokens_seen": 70778880, "step": 1080, "train_runtime": 5337.3844, "train_tokens_per_second": 13260.967 }, { "epoch": 0.07143793419845328, "grad_norm": 0.201171875, "learning_rate": 4.9775399644489245e-05, "loss": 1.5839, "num_input_tokens_seen": 71434240, "step": 1090, "train_runtime": 5385.5174, "train_tokens_per_second": 13264.137 }, { "epoch": 0.0720933280901822, "grad_norm": 0.2314453125, "learning_rate": 4.9768246895774436e-05, "loss": 1.5533, "num_input_tokens_seen": 72089600, "step": 1100, "train_runtime": 5435.233, "train_tokens_per_second": 13263.387 }, { "epoch": 0.07274872198191112, "grad_norm": 0.2373046875, "learning_rate": 4.976098256028279e-05, "loss": 1.5631, "num_input_tokens_seen": 72744960, "step": 1110, "train_runtime": 5484.2568, "train_tokens_per_second": 13264.324 }, { "epoch": 0.07340411587364006, "grad_norm": 0.20703125, "learning_rate": 4.975360667074184e-05, "loss": 1.5558, "num_input_tokens_seen": 73400320, "step": 1120, "train_runtime": 5533.3998, "train_tokens_per_second": 13264.959 }, { "epoch": 0.07405950976536899, "grad_norm": 0.2197265625, "learning_rate": 4.97461192603817e-05, "loss": 1.5637, "num_input_tokens_seen": 74055680, "step": 1130, "train_runtime": 5582.4399, "train_tokens_per_second": 13265.827 }, { "epoch": 0.07471490365709792, "grad_norm": 0.208984375, "learning_rate": 4.973852036293493e-05, "loss": 1.5899, "num_input_tokens_seen": 74711040, "step": 1140, "train_runtime": 5631.153, "train_tokens_per_second": 13267.45 }, { "epoch": 0.07537029754882685, "grad_norm": 0.2421875, "learning_rate": 4.973081001263633e-05, "loss": 1.5482, "num_input_tokens_seen": 75366400, "step": 1150, "train_runtime": 5680.2675, "train_tokens_per_second": 13268.108 }, { "epoch": 0.07602569144055578, "grad_norm": 0.224609375, "learning_rate": 4.972298824422286e-05, "loss": 1.5685, "num_input_tokens_seen": 76021760, "step": 1160, "train_runtime": 5729.0866, "train_tokens_per_second": 13269.438 }, { "epoch": 0.07668108533228471, "grad_norm": 0.1650390625, "learning_rate": 4.971505509293342e-05, "loss": 1.5469, "num_input_tokens_seen": 76677120, "step": 1170, "train_runtime": 5779.6477, "train_tokens_per_second": 13266.746 }, { "epoch": 0.07733647922401363, "grad_norm": 0.19921875, "learning_rate": 4.970701059450872e-05, "loss": 1.5695, "num_input_tokens_seen": 77332480, "step": 1180, "train_runtime": 5827.1528, "train_tokens_per_second": 13271.058 }, { "epoch": 0.07799187311574256, "grad_norm": 0.2412109375, "learning_rate": 4.9698854785191106e-05, "loss": 1.5536, "num_input_tokens_seen": 77987840, "step": 1190, "train_runtime": 5877.6373, "train_tokens_per_second": 13268.57 }, { "epoch": 0.07864726700747149, "grad_norm": 0.248046875, "learning_rate": 4.969058770172444e-05, "loss": 1.562, "num_input_tokens_seen": 78643200, "step": 1200, "train_runtime": 5924.2136, "train_tokens_per_second": 13274.876 }, { "epoch": 0.07930266089920042, "grad_norm": 0.22265625, "learning_rate": 4.968220938135386e-05, "loss": 1.5573, "num_input_tokens_seen": 79298560, "step": 1210, "train_runtime": 5974.9273, "train_tokens_per_second": 13271.887 }, { "epoch": 0.07995805479092935, "grad_norm": 0.216796875, "learning_rate": 4.967371986182568e-05, "loss": 1.5506, "num_input_tokens_seen": 79953920, "step": 1220, "train_runtime": 6022.3411, "train_tokens_per_second": 13276.219 }, { "epoch": 0.08061344868265828, "grad_norm": 0.1787109375, "learning_rate": 4.9665119181387164e-05, "loss": 1.5653, "num_input_tokens_seen": 80609280, "step": 1230, "train_runtime": 6073.3429, "train_tokens_per_second": 13272.638 }, { "epoch": 0.08126884257438721, "grad_norm": 0.2109375, "learning_rate": 4.9656407378786426e-05, "loss": 1.562, "num_input_tokens_seen": 81264640, "step": 1240, "train_runtime": 6120.8513, "train_tokens_per_second": 13276.689 }, { "epoch": 0.08192423646611613, "grad_norm": 0.2265625, "learning_rate": 4.964758449327216e-05, "loss": 1.5721, "num_input_tokens_seen": 81920000, "step": 1250, "train_runtime": 6171.9187, "train_tokens_per_second": 13273.02 }, { "epoch": 0.08257963035784506, "grad_norm": 0.1982421875, "learning_rate": 4.9638650564593565e-05, "loss": 1.5634, "num_input_tokens_seen": 82575360, "step": 1260, "train_runtime": 6219.2583, "train_tokens_per_second": 13277.365 }, { "epoch": 0.08323502424957399, "grad_norm": 0.1806640625, "learning_rate": 4.962960563300007e-05, "loss": 1.5727, "num_input_tokens_seen": 83230720, "step": 1270, "train_runtime": 6270.3793, "train_tokens_per_second": 13273.634 }, { "epoch": 0.08389041814130292, "grad_norm": 0.1962890625, "learning_rate": 4.9620449739241226e-05, "loss": 1.5462, "num_input_tokens_seen": 83886080, "step": 1280, "train_runtime": 6317.9379, "train_tokens_per_second": 13277.446 }, { "epoch": 0.08454581203303185, "grad_norm": 0.17578125, "learning_rate": 4.961118292456649e-05, "loss": 1.5645, "num_input_tokens_seen": 84541440, "step": 1290, "train_runtime": 6369.0393, "train_tokens_per_second": 13273.814 }, { "epoch": 0.08520120592476078, "grad_norm": 0.1826171875, "learning_rate": 4.960180523072504e-05, "loss": 1.5741, "num_input_tokens_seen": 85196800, "step": 1300, "train_runtime": 6416.2311, "train_tokens_per_second": 13278.325 }, { "epoch": 0.08585659981648971, "grad_norm": 0.1953125, "learning_rate": 4.959231669996559e-05, "loss": 1.5654, "num_input_tokens_seen": 85852160, "step": 1310, "train_runtime": 6467.175, "train_tokens_per_second": 13275.064 }, { "epoch": 0.08651199370821865, "grad_norm": 0.177734375, "learning_rate": 4.9582717375036206e-05, "loss": 1.5555, "num_input_tokens_seen": 86507520, "step": 1320, "train_runtime": 6514.5893, "train_tokens_per_second": 13279.044 }, { "epoch": 0.08716738759994756, "grad_norm": 0.228515625, "learning_rate": 4.9573007299184106e-05, "loss": 1.5517, "num_input_tokens_seen": 87162880, "step": 1330, "train_runtime": 6565.3965, "train_tokens_per_second": 13276.103 }, { "epoch": 0.0878227814916765, "grad_norm": 0.2177734375, "learning_rate": 4.956318651615545e-05, "loss": 1.5549, "num_input_tokens_seen": 87818240, "step": 1340, "train_runtime": 6612.6718, "train_tokens_per_second": 13280.296 }, { "epoch": 0.08847817538340542, "grad_norm": 0.2158203125, "learning_rate": 4.9553255070195195e-05, "loss": 1.5466, "num_input_tokens_seen": 88473600, "step": 1350, "train_runtime": 6663.1854, "train_tokens_per_second": 13277.974 }, { "epoch": 0.08913356927513436, "grad_norm": 0.234375, "learning_rate": 4.954321300604683e-05, "loss": 1.5699, "num_input_tokens_seen": 89128960, "step": 1360, "train_runtime": 6710.4402, "train_tokens_per_second": 13282.133 }, { "epoch": 0.08978896316686329, "grad_norm": 0.185546875, "learning_rate": 4.953306036895221e-05, "loss": 1.5488, "num_input_tokens_seen": 89784320, "step": 1370, "train_runtime": 6761.289, "train_tokens_per_second": 13279.172 }, { "epoch": 0.09044435705859222, "grad_norm": 0.181640625, "learning_rate": 4.952279720465137e-05, "loss": 1.5556, "num_input_tokens_seen": 90439680, "step": 1380, "train_runtime": 6808.9833, "train_tokens_per_second": 13282.406 }, { "epoch": 0.09109975095032115, "grad_norm": 0.2197265625, "learning_rate": 4.951242355938227e-05, "loss": 1.5477, "num_input_tokens_seen": 91095040, "step": 1390, "train_runtime": 6860.0552, "train_tokens_per_second": 13279.053 }, { "epoch": 0.09175514484205007, "grad_norm": 0.224609375, "learning_rate": 4.950193947988061e-05, "loss": 1.5396, "num_input_tokens_seen": 91750400, "step": 1400, "train_runtime": 6907.4565, "train_tokens_per_second": 13282.805 }, { "epoch": 0.092410538733779, "grad_norm": 0.1904296875, "learning_rate": 4.949134501337965e-05, "loss": 1.5546, "num_input_tokens_seen": 92405760, "step": 1410, "train_runtime": 6957.843, "train_tokens_per_second": 13280.806 }, { "epoch": 0.09306593262550793, "grad_norm": 0.177734375, "learning_rate": 4.948064020760995e-05, "loss": 1.5485, "num_input_tokens_seen": 93061120, "step": 1420, "train_runtime": 7005.1027, "train_tokens_per_second": 13284.762 }, { "epoch": 0.09372132651723686, "grad_norm": 0.2119140625, "learning_rate": 4.946982511079916e-05, "loss": 1.5622, "num_input_tokens_seen": 93716480, "step": 1430, "train_runtime": 7055.8419, "train_tokens_per_second": 13282.112 }, { "epoch": 0.09437672040896579, "grad_norm": 0.189453125, "learning_rate": 4.945889977167186e-05, "loss": 1.5506, "num_input_tokens_seen": 94371840, "step": 1440, "train_runtime": 7102.9715, "train_tokens_per_second": 13286.248 }, { "epoch": 0.09503211430069472, "grad_norm": 0.2041015625, "learning_rate": 4.944786423944926e-05, "loss": 1.5708, "num_input_tokens_seen": 95027200, "step": 1450, "train_runtime": 7153.892, "train_tokens_per_second": 13283.287 }, { "epoch": 0.09568750819242365, "grad_norm": 0.24609375, "learning_rate": 4.9436718563849026e-05, "loss": 1.5733, "num_input_tokens_seen": 95682560, "step": 1460, "train_runtime": 7200.7881, "train_tokens_per_second": 13287.79 }, { "epoch": 0.09634290208415258, "grad_norm": 0.216796875, "learning_rate": 4.942546279508505e-05, "loss": 1.5619, "num_input_tokens_seen": 96337920, "step": 1470, "train_runtime": 7251.7726, "train_tokens_per_second": 13284.741 }, { "epoch": 0.0969982959758815, "grad_norm": 0.1787109375, "learning_rate": 4.941409698386722e-05, "loss": 1.5629, "num_input_tokens_seen": 96993280, "step": 1480, "train_runtime": 7298.9447, "train_tokens_per_second": 13288.672 }, { "epoch": 0.09765368986761043, "grad_norm": 0.205078125, "learning_rate": 4.940262118140119e-05, "loss": 1.5579, "num_input_tokens_seen": 97648640, "step": 1490, "train_runtime": 7350.2117, "train_tokens_per_second": 13285.147 }, { "epoch": 0.09830908375933936, "grad_norm": 0.185546875, "learning_rate": 4.9391035439388134e-05, "loss": 1.5379, "num_input_tokens_seen": 98304000, "step": 1500, "train_runtime": 7398.3428, "train_tokens_per_second": 13287.3 }, { "epoch": 0.09896447765106829, "grad_norm": 0.2314453125, "learning_rate": 4.937933981002456e-05, "loss": 1.561, "num_input_tokens_seen": 98959360, "step": 1510, "train_runtime": 7448.8575, "train_tokens_per_second": 13285.173 }, { "epoch": 0.09961987154279722, "grad_norm": 0.197265625, "learning_rate": 4.936753434600202e-05, "loss": 1.5532, "num_input_tokens_seen": 99614720, "step": 1520, "train_runtime": 7497.651, "train_tokens_per_second": 13286.124 }, { "epoch": 0.10027526543452615, "grad_norm": 0.162109375, "learning_rate": 4.9355619100506914e-05, "loss": 1.5548, "num_input_tokens_seen": 100270080, "step": 1530, "train_runtime": 7555.8462, "train_tokens_per_second": 13270.529 }, { "epoch": 0.10093065932625508, "grad_norm": 0.189453125, "learning_rate": 4.934359412722022e-05, "loss": 1.5474, "num_input_tokens_seen": 100925440, "step": 1540, "train_runtime": 7603.1525, "train_tokens_per_second": 13274.157 }, { "epoch": 0.101586053217984, "grad_norm": 0.2421875, "learning_rate": 4.9331459480317264e-05, "loss": 1.5422, "num_input_tokens_seen": 101580800, "step": 1550, "train_runtime": 7654.1324, "train_tokens_per_second": 13271.367 }, { "epoch": 0.10224144710971293, "grad_norm": 0.59375, "learning_rate": 4.93192152144675e-05, "loss": 1.54, "num_input_tokens_seen": 102236160, "step": 1560, "train_runtime": 7701.2852, "train_tokens_per_second": 13275.208 }, { "epoch": 0.10289684100144186, "grad_norm": 0.18359375, "learning_rate": 4.93068613848342e-05, "loss": 1.5575, "num_input_tokens_seen": 102891520, "step": 1570, "train_runtime": 7752.3977, "train_tokens_per_second": 13272.219 }, { "epoch": 0.1035522348931708, "grad_norm": 0.2041015625, "learning_rate": 4.92943980470743e-05, "loss": 1.5255, "num_input_tokens_seen": 103546880, "step": 1580, "train_runtime": 7799.7148, "train_tokens_per_second": 13275.726 }, { "epoch": 0.10420762878489973, "grad_norm": 0.162109375, "learning_rate": 4.928182525733805e-05, "loss": 1.5554, "num_input_tokens_seen": 104202240, "step": 1590, "train_runtime": 7850.4718, "train_tokens_per_second": 13273.373 }, { "epoch": 0.10486302267662866, "grad_norm": 0.216796875, "learning_rate": 4.9269143072268834e-05, "loss": 1.5559, "num_input_tokens_seen": 104857600, "step": 1600, "train_runtime": 7897.5066, "train_tokens_per_second": 13277.304 }, { "epoch": 0.10551841656835759, "grad_norm": 0.171875, "learning_rate": 4.9256351549002864e-05, "loss": 1.5732, "num_input_tokens_seen": 105512960, "step": 1610, "train_runtime": 7948.6254, "train_tokens_per_second": 13274.366 }, { "epoch": 0.10617381046008652, "grad_norm": 0.201171875, "learning_rate": 4.9243450745168975e-05, "loss": 1.5312, "num_input_tokens_seen": 106168320, "step": 1620, "train_runtime": 7998.7345, "train_tokens_per_second": 13273.14 }, { "epoch": 0.10682920435181543, "grad_norm": 0.203125, "learning_rate": 4.923044071888831e-05, "loss": 1.5816, "num_input_tokens_seen": 106823680, "step": 1630, "train_runtime": 8049.8848, "train_tokens_per_second": 13270.212 }, { "epoch": 0.10748459824354437, "grad_norm": 0.2275390625, "learning_rate": 4.921732152877411e-05, "loss": 1.5452, "num_input_tokens_seen": 107479040, "step": 1640, "train_runtime": 8097.5808, "train_tokens_per_second": 13272.982 }, { "epoch": 0.1081399921352733, "grad_norm": 0.21875, "learning_rate": 4.920409323393141e-05, "loss": 1.5512, "num_input_tokens_seen": 108134400, "step": 1650, "train_runtime": 8148.5782, "train_tokens_per_second": 13270.34 }, { "epoch": 0.10879538602700223, "grad_norm": 0.2080078125, "learning_rate": 4.919075589395678e-05, "loss": 1.5693, "num_input_tokens_seen": 108789760, "step": 1660, "train_runtime": 8195.8423, "train_tokens_per_second": 13273.774 }, { "epoch": 0.10945077991873116, "grad_norm": 0.166015625, "learning_rate": 4.917730956893807e-05, "loss": 1.5643, "num_input_tokens_seen": 109445120, "step": 1670, "train_runtime": 8246.9895, "train_tokens_per_second": 13270.918 }, { "epoch": 0.11010617381046009, "grad_norm": 0.21875, "learning_rate": 4.916375431945415e-05, "loss": 1.5695, "num_input_tokens_seen": 110100480, "step": 1680, "train_runtime": 8294.444, "train_tokens_per_second": 13274.004 }, { "epoch": 0.11076156770218902, "grad_norm": 0.18359375, "learning_rate": 4.91500902065746e-05, "loss": 1.5508, "num_input_tokens_seen": 110755840, "step": 1690, "train_runtime": 8345.2826, "train_tokens_per_second": 13271.67 }, { "epoch": 0.11141696159391795, "grad_norm": 0.1806640625, "learning_rate": 4.9136317291859465e-05, "loss": 1.5605, "num_input_tokens_seen": 111411200, "step": 1700, "train_runtime": 8393.9311, "train_tokens_per_second": 13272.828 }, { "epoch": 0.11207235548564687, "grad_norm": 0.2158203125, "learning_rate": 4.912243563735895e-05, "loss": 1.5563, "num_input_tokens_seen": 112066560, "step": 1710, "train_runtime": 8443.7663, "train_tokens_per_second": 13272.106 }, { "epoch": 0.1127277493773758, "grad_norm": 0.171875, "learning_rate": 4.9108445305613196e-05, "loss": 1.5407, "num_input_tokens_seen": 112721920, "step": 1720, "train_runtime": 8492.4349, "train_tokens_per_second": 13273.216 }, { "epoch": 0.11338314326910473, "grad_norm": 0.2138671875, "learning_rate": 4.909434635965192e-05, "loss": 1.5332, "num_input_tokens_seen": 113377280, "step": 1730, "train_runtime": 8541.5742, "train_tokens_per_second": 13273.581 }, { "epoch": 0.11403853716083366, "grad_norm": 0.2060546875, "learning_rate": 4.90801388629942e-05, "loss": 1.5332, "num_input_tokens_seen": 114032640, "step": 1740, "train_runtime": 8590.5735, "train_tokens_per_second": 13274.159 }, { "epoch": 0.11469393105256259, "grad_norm": 0.2158203125, "learning_rate": 4.906582287964814e-05, "loss": 1.5516, "num_input_tokens_seen": 114688000, "step": 1750, "train_runtime": 8639.9892, "train_tokens_per_second": 13274.091 }, { "epoch": 0.11534932494429152, "grad_norm": 0.1845703125, "learning_rate": 4.905139847411062e-05, "loss": 1.5648, "num_input_tokens_seen": 115343360, "step": 1760, "train_runtime": 8689.457, "train_tokens_per_second": 13273.943 }, { "epoch": 0.11600471883602045, "grad_norm": 0.208984375, "learning_rate": 4.9036865711366976e-05, "loss": 1.5538, "num_input_tokens_seen": 115998720, "step": 1770, "train_runtime": 8738.4993, "train_tokens_per_second": 13274.444 }, { "epoch": 0.11666011272774937, "grad_norm": 0.1865234375, "learning_rate": 4.902222465689071e-05, "loss": 1.5662, "num_input_tokens_seen": 116654080, "step": 1780, "train_runtime": 8788.1489, "train_tokens_per_second": 13274.022 }, { "epoch": 0.1173155066194783, "grad_norm": 0.1767578125, "learning_rate": 4.900747537664324e-05, "loss": 1.5507, "num_input_tokens_seen": 117309440, "step": 1790, "train_runtime": 8836.838, "train_tokens_per_second": 13275.047 }, { "epoch": 0.11797090051120723, "grad_norm": 0.189453125, "learning_rate": 4.899261793707353e-05, "loss": 1.5691, "num_input_tokens_seen": 117964800, "step": 1800, "train_runtime": 8887.8229, "train_tokens_per_second": 13272.632 }, { "epoch": 0.11862629440293616, "grad_norm": 0.189453125, "learning_rate": 4.8977652405117826e-05, "loss": 1.5694, "num_input_tokens_seen": 118620160, "step": 1810, "train_runtime": 8935.4266, "train_tokens_per_second": 13275.265 }, { "epoch": 0.1192816882946651, "grad_norm": 0.1943359375, "learning_rate": 4.8962578848199384e-05, "loss": 1.5769, "num_input_tokens_seen": 119275520, "step": 1820, "train_runtime": 8986.5605, "train_tokens_per_second": 13272.655 }, { "epoch": 0.11993708218639403, "grad_norm": 0.1884765625, "learning_rate": 4.8947397334228125e-05, "loss": 1.5405, "num_input_tokens_seen": 119930880, "step": 1830, "train_runtime": 9033.6701, "train_tokens_per_second": 13275.986 }, { "epoch": 0.12059247607812296, "grad_norm": 0.173828125, "learning_rate": 4.893210793160032e-05, "loss": 1.5601, "num_input_tokens_seen": 120586240, "step": 1840, "train_runtime": 9084.6984, "train_tokens_per_second": 13273.555 }, { "epoch": 0.12124786996985189, "grad_norm": 0.2255859375, "learning_rate": 4.891671070919835e-05, "loss": 1.5492, "num_input_tokens_seen": 121241600, "step": 1850, "train_runtime": 9132.29, "train_tokens_per_second": 13276.144 }, { "epoch": 0.1219032638615808, "grad_norm": 0.177734375, "learning_rate": 4.890120573639031e-05, "loss": 1.5413, "num_input_tokens_seen": 121896960, "step": 1860, "train_runtime": 9183.6701, "train_tokens_per_second": 13273.229 }, { "epoch": 0.12255865775330974, "grad_norm": 0.1943359375, "learning_rate": 4.8885593083029747e-05, "loss": 1.5263, "num_input_tokens_seen": 122552320, "step": 1870, "train_runtime": 9231.7926, "train_tokens_per_second": 13275.03 }, { "epoch": 0.12321405164503867, "grad_norm": 0.1875, "learning_rate": 4.886987281945533e-05, "loss": 1.5315, "num_input_tokens_seen": 123207680, "step": 1880, "train_runtime": 9282.8329, "train_tokens_per_second": 13272.638 }, { "epoch": 0.1238694455367676, "grad_norm": 0.1982421875, "learning_rate": 4.885404501649056e-05, "loss": 1.5662, "num_input_tokens_seen": 123863040, "step": 1890, "train_runtime": 9330.1966, "train_tokens_per_second": 13275.502 }, { "epoch": 0.12452483942849653, "grad_norm": 0.1875, "learning_rate": 4.8838109745443394e-05, "loss": 1.5689, "num_input_tokens_seen": 124518400, "step": 1900, "train_runtime": 9381.2589, "train_tokens_per_second": 13273.101 }, { "epoch": 0.12518023332022546, "grad_norm": 0.263671875, "learning_rate": 4.882206707810599e-05, "loss": 1.543, "num_input_tokens_seen": 125173760, "step": 1910, "train_runtime": 9428.8624, "train_tokens_per_second": 13275.595 }, { "epoch": 0.12583562721195438, "grad_norm": 0.1748046875, "learning_rate": 4.880591708675433e-05, "loss": 1.5616, "num_input_tokens_seen": 125829120, "step": 1920, "train_runtime": 9479.7961, "train_tokens_per_second": 13273.399 }, { "epoch": 0.12649102110368332, "grad_norm": 0.2041015625, "learning_rate": 4.878965984414792e-05, "loss": 1.5459, "num_input_tokens_seen": 126484480, "step": 1930, "train_runtime": 9527.2631, "train_tokens_per_second": 13276.056 }, { "epoch": 0.12714641499541224, "grad_norm": 0.16796875, "learning_rate": 4.8773295423529466e-05, "loss": 1.5641, "num_input_tokens_seen": 127139840, "step": 1940, "train_runtime": 9577.9898, "train_tokens_per_second": 13274.167 }, { "epoch": 0.12780180888714118, "grad_norm": 0.2294921875, "learning_rate": 4.875682389862453e-05, "loss": 1.5551, "num_input_tokens_seen": 127795200, "step": 1950, "train_runtime": 9625.4214, "train_tokens_per_second": 13276.842 }, { "epoch": 0.1284572027788701, "grad_norm": 0.1865234375, "learning_rate": 4.87402453436412e-05, "loss": 1.5485, "num_input_tokens_seen": 128450560, "step": 1960, "train_runtime": 9676.5377, "train_tokens_per_second": 13274.434 }, { "epoch": 0.12911259667059902, "grad_norm": 0.205078125, "learning_rate": 4.872355983326977e-05, "loss": 1.5692, "num_input_tokens_seen": 129105920, "step": 1970, "train_runtime": 9724.0882, "train_tokens_per_second": 13276.918 }, { "epoch": 0.12976799056232796, "grad_norm": 0.1953125, "learning_rate": 4.870676744268239e-05, "loss": 1.5746, "num_input_tokens_seen": 129761280, "step": 1980, "train_runtime": 9775.0653, "train_tokens_per_second": 13274.723 }, { "epoch": 0.13042338445405688, "grad_norm": 0.2265625, "learning_rate": 4.868986824753273e-05, "loss": 1.569, "num_input_tokens_seen": 130416640, "step": 1990, "train_runtime": 9822.5181, "train_tokens_per_second": 13277.312 }, { "epoch": 0.13107877834578582, "grad_norm": 0.1796875, "learning_rate": 4.8672862323955634e-05, "loss": 1.5818, "num_input_tokens_seen": 131072000, "step": 2000, "train_runtime": 9873.3834, "train_tokens_per_second": 13275.287 }, { "epoch": 0.13173417223751474, "grad_norm": 0.1689453125, "learning_rate": 4.8655749748566787e-05, "loss": 1.5659, "num_input_tokens_seen": 131727360, "step": 2010, "train_runtime": 9919.9718, "train_tokens_per_second": 13279.005 }, { "epoch": 0.13238956612924369, "grad_norm": 0.1904296875, "learning_rate": 4.863853059846236e-05, "loss": 1.5531, "num_input_tokens_seen": 132382720, "step": 2020, "train_runtime": 9970.9151, "train_tokens_per_second": 13276.888 }, { "epoch": 0.1330449600209726, "grad_norm": 0.2333984375, "learning_rate": 4.862120495121869e-05, "loss": 1.5625, "num_input_tokens_seen": 133038080, "step": 2030, "train_runtime": 10017.687, "train_tokens_per_second": 13280.319 }, { "epoch": 0.13370035391270155, "grad_norm": 0.1865234375, "learning_rate": 4.860377288489187e-05, "loss": 1.568, "num_input_tokens_seen": 133693440, "step": 2040, "train_runtime": 10068.3237, "train_tokens_per_second": 13278.62 }, { "epoch": 0.13435574780443046, "grad_norm": 0.1689453125, "learning_rate": 4.858623447801748e-05, "loss": 1.5504, "num_input_tokens_seen": 134348800, "step": 2050, "train_runtime": 10115.448, "train_tokens_per_second": 13281.547 }, { "epoch": 0.13501114169615938, "grad_norm": 0.1875, "learning_rate": 4.8568589809610155e-05, "loss": 1.5289, "num_input_tokens_seen": 135004160, "step": 2060, "train_runtime": 10166.4073, "train_tokens_per_second": 13279.437 }, { "epoch": 0.13566653558788833, "grad_norm": 0.197265625, "learning_rate": 4.855083895916327e-05, "loss": 1.5529, "num_input_tokens_seen": 135659520, "step": 2070, "train_runtime": 10213.8363, "train_tokens_per_second": 13281.936 }, { "epoch": 0.13632192947961724, "grad_norm": 0.189453125, "learning_rate": 4.85329820066486e-05, "loss": 1.5438, "num_input_tokens_seen": 136314880, "step": 2080, "train_runtime": 10264.6794, "train_tokens_per_second": 13279.994 }, { "epoch": 0.1369773233713462, "grad_norm": 0.1767578125, "learning_rate": 4.851501903251589e-05, "loss": 1.5461, "num_input_tokens_seen": 136970240, "step": 2090, "train_runtime": 10312.1495, "train_tokens_per_second": 13282.414 }, { "epoch": 0.1376327172630751, "grad_norm": 0.2021484375, "learning_rate": 4.849695011769259e-05, "loss": 1.5561, "num_input_tokens_seen": 137625600, "step": 2100, "train_runtime": 10362.8534, "train_tokens_per_second": 13280.666 }, { "epoch": 0.13828811115480405, "grad_norm": 0.2177734375, "learning_rate": 4.847877534358338e-05, "loss": 1.5457, "num_input_tokens_seen": 138280960, "step": 2110, "train_runtime": 10410.3974, "train_tokens_per_second": 13282.967 }, { "epoch": 0.13894350504653297, "grad_norm": 0.2041015625, "learning_rate": 4.846049479206992e-05, "loss": 1.5417, "num_input_tokens_seen": 138936320, "step": 2120, "train_runtime": 10461.4853, "train_tokens_per_second": 13280.745 }, { "epoch": 0.13959889893826188, "grad_norm": 0.1748046875, "learning_rate": 4.844210854551038e-05, "loss": 1.5296, "num_input_tokens_seen": 139591680, "step": 2130, "train_runtime": 10508.9536, "train_tokens_per_second": 13283.119 }, { "epoch": 0.14025429282999083, "grad_norm": 0.2021484375, "learning_rate": 4.842361668673912e-05, "loss": 1.5422, "num_input_tokens_seen": 140247040, "step": 2140, "train_runtime": 10559.9966, "train_tokens_per_second": 13280.974 }, { "epoch": 0.14090968672171975, "grad_norm": 0.1796875, "learning_rate": 4.840501929906633e-05, "loss": 1.5572, "num_input_tokens_seen": 140902400, "step": 2150, "train_runtime": 10607.373, "train_tokens_per_second": 13283.44 }, { "epoch": 0.1415650806134487, "grad_norm": 0.2255859375, "learning_rate": 4.8386316466277595e-05, "loss": 1.5464, "num_input_tokens_seen": 141557760, "step": 2160, "train_runtime": 10658.6936, "train_tokens_per_second": 13280.967 }, { "epoch": 0.1422204745051776, "grad_norm": 0.1806640625, "learning_rate": 4.836750827263359e-05, "loss": 1.537, "num_input_tokens_seen": 142213120, "step": 2170, "train_runtime": 10706.4568, "train_tokens_per_second": 13282.93 }, { "epoch": 0.14287586839690655, "grad_norm": 0.248046875, "learning_rate": 4.8348594802869635e-05, "loss": 1.5502, "num_input_tokens_seen": 142868480, "step": 2180, "train_runtime": 10756.7659, "train_tokens_per_second": 13281.732 }, { "epoch": 0.14353126228863547, "grad_norm": 0.173828125, "learning_rate": 4.832957614219537e-05, "loss": 1.5586, "num_input_tokens_seen": 143523840, "step": 2190, "train_runtime": 10805.5975, "train_tokens_per_second": 13282.36 }, { "epoch": 0.1441866561803644, "grad_norm": 0.2060546875, "learning_rate": 4.831045237629433e-05, "loss": 1.5482, "num_input_tokens_seen": 144179200, "step": 2200, "train_runtime": 10855.0685, "train_tokens_per_second": 13282.201 }, { "epoch": 0.14484205007209333, "grad_norm": 0.1787109375, "learning_rate": 4.8291223591323586e-05, "loss": 1.552, "num_input_tokens_seen": 144834560, "step": 2210, "train_runtime": 10904.2967, "train_tokens_per_second": 13282.339 }, { "epoch": 0.14549744396382225, "grad_norm": 0.169921875, "learning_rate": 4.8271889873913325e-05, "loss": 1.5717, "num_input_tokens_seen": 145489920, "step": 2220, "train_runtime": 10953.7205, "train_tokens_per_second": 13282.238 }, { "epoch": 0.1461528378555512, "grad_norm": 0.1845703125, "learning_rate": 4.8252451311166496e-05, "loss": 1.5468, "num_input_tokens_seen": 146145280, "step": 2230, "train_runtime": 11003.0128, "train_tokens_per_second": 13282.297 }, { "epoch": 0.1468082317472801, "grad_norm": 0.1875, "learning_rate": 4.823290799065839e-05, "loss": 1.5599, "num_input_tokens_seen": 146800640, "step": 2240, "train_runtime": 11052.0298, "train_tokens_per_second": 13282.686 }, { "epoch": 0.14746362563900905, "grad_norm": 0.19921875, "learning_rate": 4.8213260000436275e-05, "loss": 1.5582, "num_input_tokens_seen": 147456000, "step": 2250, "train_runtime": 11101.3249, "train_tokens_per_second": 13282.739 }, { "epoch": 0.14811901953073797, "grad_norm": 0.2119140625, "learning_rate": 4.8193507429018946e-05, "loss": 1.5546, "num_input_tokens_seen": 148111360, "step": 2260, "train_runtime": 11149.8755, "train_tokens_per_second": 13283.678 }, { "epoch": 0.1487744134224669, "grad_norm": 0.1689453125, "learning_rate": 4.8173650365396406e-05, "loss": 1.5644, "num_input_tokens_seen": 148766720, "step": 2270, "train_runtime": 11200.1123, "train_tokens_per_second": 13282.61 }, { "epoch": 0.14942980731419583, "grad_norm": 0.185546875, "learning_rate": 4.815368889902937e-05, "loss": 1.5687, "num_input_tokens_seen": 149422080, "step": 2280, "train_runtime": 11248.6716, "train_tokens_per_second": 13283.531 }, { "epoch": 0.15008520120592475, "grad_norm": 0.21484375, "learning_rate": 4.813362311984897e-05, "loss": 1.5448, "num_input_tokens_seen": 150077440, "step": 2290, "train_runtime": 11299.7353, "train_tokens_per_second": 13281.5 }, { "epoch": 0.1507405950976537, "grad_norm": 0.193359375, "learning_rate": 4.811345311825623e-05, "loss": 1.5514, "num_input_tokens_seen": 150732800, "step": 2300, "train_runtime": 11347.084, "train_tokens_per_second": 13283.836 }, { "epoch": 0.1513959889893826, "grad_norm": 0.2021484375, "learning_rate": 4.8093178985121776e-05, "loss": 1.554, "num_input_tokens_seen": 151388160, "step": 2310, "train_runtime": 11397.8489, "train_tokens_per_second": 13282.169 }, { "epoch": 0.15205138288111156, "grad_norm": 0.1826171875, "learning_rate": 4.807280081178532e-05, "loss": 1.5876, "num_input_tokens_seen": 152043520, "step": 2320, "train_runtime": 11445.5502, "train_tokens_per_second": 13284.073 }, { "epoch": 0.15270677677284047, "grad_norm": 0.2109375, "learning_rate": 4.805231869005536e-05, "loss": 1.5454, "num_input_tokens_seen": 152698880, "step": 2330, "train_runtime": 11496.5678, "train_tokens_per_second": 13282.128 }, { "epoch": 0.15336217066456942, "grad_norm": 0.2060546875, "learning_rate": 4.803173271220862e-05, "loss": 1.5552, "num_input_tokens_seen": 153354240, "step": 2340, "train_runtime": 11544.049, "train_tokens_per_second": 13284.268 }, { "epoch": 0.15401756455629834, "grad_norm": 0.1904296875, "learning_rate": 4.8011042970989815e-05, "loss": 1.5447, "num_input_tokens_seen": 154009600, "step": 2350, "train_runtime": 11594.9377, "train_tokens_per_second": 13282.486 }, { "epoch": 0.15467295844802725, "grad_norm": 0.1787109375, "learning_rate": 4.799024955961108e-05, "loss": 1.5201, "num_input_tokens_seen": 154664960, "step": 2360, "train_runtime": 11642.0709, "train_tokens_per_second": 13285.004 }, { "epoch": 0.1553283523397562, "grad_norm": 0.16796875, "learning_rate": 4.796935257175163e-05, "loss": 1.5804, "num_input_tokens_seen": 155320320, "step": 2370, "train_runtime": 11692.8852, "train_tokens_per_second": 13283.319 }, { "epoch": 0.15598374623148512, "grad_norm": 0.232421875, "learning_rate": 4.7948352101557284e-05, "loss": 1.573, "num_input_tokens_seen": 155975680, "step": 2380, "train_runtime": 11740.3689, "train_tokens_per_second": 13285.416 }, { "epoch": 0.15663914012321406, "grad_norm": 0.232421875, "learning_rate": 4.7927248243640125e-05, "loss": 1.5416, "num_input_tokens_seen": 156631040, "step": 2390, "train_runtime": 11791.2801, "train_tokens_per_second": 13283.633 }, { "epoch": 0.15729453401494298, "grad_norm": 0.19140625, "learning_rate": 4.790604109307799e-05, "loss": 1.5507, "num_input_tokens_seen": 157286400, "step": 2400, "train_runtime": 11838.7609, "train_tokens_per_second": 13285.715 }, { "epoch": 0.15794992790667192, "grad_norm": 0.1708984375, "learning_rate": 4.788473074541409e-05, "loss": 1.5487, "num_input_tokens_seen": 157941760, "step": 2410, "train_runtime": 11890.2501, "train_tokens_per_second": 13283.3 }, { "epoch": 0.15860532179840084, "grad_norm": 0.181640625, "learning_rate": 4.786331729665652e-05, "loss": 1.5432, "num_input_tokens_seen": 158597120, "step": 2420, "train_runtime": 11937.6644, "train_tokens_per_second": 13285.44 }, { "epoch": 0.15926071569012976, "grad_norm": 0.193359375, "learning_rate": 4.784180084327793e-05, "loss": 1.5482, "num_input_tokens_seen": 159252480, "step": 2430, "train_runtime": 11992.3311, "train_tokens_per_second": 13279.527 }, { "epoch": 0.1599161095818587, "grad_norm": 0.25, "learning_rate": 4.7820181482215e-05, "loss": 1.5226, "num_input_tokens_seen": 159907840, "step": 2440, "train_runtime": 12040.269, "train_tokens_per_second": 13281.085 }, { "epoch": 0.16057150347358762, "grad_norm": 0.1953125, "learning_rate": 4.779845931086802e-05, "loss": 1.5281, "num_input_tokens_seen": 160563200, "step": 2450, "train_runtime": 12091.3357, "train_tokens_per_second": 13279.195 }, { "epoch": 0.16122689736531656, "grad_norm": 0.1748046875, "learning_rate": 4.77766344271005e-05, "loss": 1.5276, "num_input_tokens_seen": 161218560, "step": 2460, "train_runtime": 12138.789, "train_tokens_per_second": 13281.272 }, { "epoch": 0.16188229125704548, "grad_norm": 0.1845703125, "learning_rate": 4.7754706929238655e-05, "loss": 1.5329, "num_input_tokens_seen": 161873920, "step": 2470, "train_runtime": 12189.8441, "train_tokens_per_second": 13279.409 }, { "epoch": 0.16253768514877442, "grad_norm": 0.2099609375, "learning_rate": 4.773267691607104e-05, "loss": 1.5498, "num_input_tokens_seen": 162529280, "step": 2480, "train_runtime": 12237.1535, "train_tokens_per_second": 13281.625 }, { "epoch": 0.16319307904050334, "grad_norm": 0.216796875, "learning_rate": 4.771054448684803e-05, "loss": 1.547, "num_input_tokens_seen": 163184640, "step": 2490, "train_runtime": 12288.2685, "train_tokens_per_second": 13279.71 }, { "epoch": 0.16384847293223226, "grad_norm": 0.21875, "learning_rate": 4.7688309741281426e-05, "loss": 1.5454, "num_input_tokens_seen": 163840000, "step": 2500, "train_runtime": 12335.9566, "train_tokens_per_second": 13281.499 }, { "epoch": 0.1645038668239612, "grad_norm": 0.1943359375, "learning_rate": 4.766597277954398e-05, "loss": 1.5285, "num_input_tokens_seen": 164495360, "step": 2510, "train_runtime": 12386.9676, "train_tokens_per_second": 13279.712 }, { "epoch": 0.16515926071569012, "grad_norm": 0.21484375, "learning_rate": 4.7643533702268975e-05, "loss": 1.5634, "num_input_tokens_seen": 165150720, "step": 2520, "train_runtime": 12434.339, "train_tokens_per_second": 13281.825 }, { "epoch": 0.16581465460741907, "grad_norm": 0.1884765625, "learning_rate": 4.762099261054971e-05, "loss": 1.5431, "num_input_tokens_seen": 165806080, "step": 2530, "train_runtime": 12484.8104, "train_tokens_per_second": 13280.625 }, { "epoch": 0.16647004849914798, "grad_norm": 0.1982421875, "learning_rate": 4.7598349605939115e-05, "loss": 1.5494, "num_input_tokens_seen": 166461440, "step": 2540, "train_runtime": 12532.1516, "train_tokens_per_second": 13282.75 }, { "epoch": 0.16712544239087693, "grad_norm": 0.1962890625, "learning_rate": 4.757560479044926e-05, "loss": 1.5421, "num_input_tokens_seen": 167116800, "step": 2550, "train_runtime": 12583.1966, "train_tokens_per_second": 13280.95 }, { "epoch": 0.16778083628260584, "grad_norm": 0.1806640625, "learning_rate": 4.7552758266550875e-05, "loss": 1.5335, "num_input_tokens_seen": 167772160, "step": 2560, "train_runtime": 12630.5914, "train_tokens_per_second": 13283.001 }, { "epoch": 0.1684362301743348, "grad_norm": 0.25, "learning_rate": 4.752981013717292e-05, "loss": 1.5291, "num_input_tokens_seen": 168427520, "step": 2570, "train_runtime": 12681.6764, "train_tokens_per_second": 13281.172 }, { "epoch": 0.1690916240660637, "grad_norm": 0.1796875, "learning_rate": 4.7506760505702117e-05, "loss": 1.5339, "num_input_tokens_seen": 169082880, "step": 2580, "train_runtime": 12729.0576, "train_tokens_per_second": 13283.221 }, { "epoch": 0.16974701795779262, "grad_norm": 0.162109375, "learning_rate": 4.7483609475982486e-05, "loss": 1.5567, "num_input_tokens_seen": 169738240, "step": 2590, "train_runtime": 12780.0234, "train_tokens_per_second": 13281.528 }, { "epoch": 0.17040241184952157, "grad_norm": 0.1669921875, "learning_rate": 4.7460357152314846e-05, "loss": 1.5205, "num_input_tokens_seen": 170393600, "step": 2600, "train_runtime": 12827.5604, "train_tokens_per_second": 13283.399 }, { "epoch": 0.17105780574125048, "grad_norm": 0.1796875, "learning_rate": 4.74370036394564e-05, "loss": 1.5139, "num_input_tokens_seen": 171048960, "step": 2610, "train_runtime": 12878.6814, "train_tokens_per_second": 13281.558 }, { "epoch": 0.17171319963297943, "grad_norm": 0.1943359375, "learning_rate": 4.74135490426202e-05, "loss": 1.5475, "num_input_tokens_seen": 171704320, "step": 2620, "train_runtime": 12926.1247, "train_tokens_per_second": 13283.511 }, { "epoch": 0.17236859352470835, "grad_norm": 0.185546875, "learning_rate": 4.738999346747472e-05, "loss": 1.5655, "num_input_tokens_seen": 172359680, "step": 2630, "train_runtime": 12977.2033, "train_tokens_per_second": 13281.728 }, { "epoch": 0.1730239874164373, "grad_norm": 0.234375, "learning_rate": 4.7366337020143374e-05, "loss": 1.5513, "num_input_tokens_seen": 173015040, "step": 2640, "train_runtime": 13024.6672, "train_tokens_per_second": 13283.644 }, { "epoch": 0.1736793813081662, "grad_norm": 0.1806640625, "learning_rate": 4.734257980720403e-05, "loss": 1.5312, "num_input_tokens_seen": 173670400, "step": 2650, "train_runtime": 13075.7912, "train_tokens_per_second": 13281.827 }, { "epoch": 0.17433477519989513, "grad_norm": 0.1904296875, "learning_rate": 4.7318721935688486e-05, "loss": 1.5369, "num_input_tokens_seen": 174325760, "step": 2660, "train_runtime": 13123.4979, "train_tokens_per_second": 13283.483 }, { "epoch": 0.17499016909162407, "grad_norm": 0.16796875, "learning_rate": 4.7294763513082095e-05, "loss": 1.535, "num_input_tokens_seen": 174981120, "step": 2670, "train_runtime": 13174.5389, "train_tokens_per_second": 13281.764 }, { "epoch": 0.175645562983353, "grad_norm": 0.1982421875, "learning_rate": 4.727070464732317e-05, "loss": 1.5399, "num_input_tokens_seen": 175636480, "step": 2680, "train_runtime": 13222.1172, "train_tokens_per_second": 13283.537 }, { "epoch": 0.17630095687508193, "grad_norm": 0.173828125, "learning_rate": 4.7246545446802574e-05, "loss": 1.5717, "num_input_tokens_seen": 176291840, "step": 2690, "train_runtime": 13273.1386, "train_tokens_per_second": 13281.85 }, { "epoch": 0.17695635076681085, "grad_norm": 0.1826171875, "learning_rate": 4.722228602036316e-05, "loss": 1.5596, "num_input_tokens_seen": 176947200, "step": 2700, "train_runtime": 13320.5805, "train_tokens_per_second": 13283.745 }, { "epoch": 0.1776117446585398, "grad_norm": 0.1826171875, "learning_rate": 4.7197926477299375e-05, "loss": 1.5639, "num_input_tokens_seen": 177602560, "step": 2710, "train_runtime": 13371.3394, "train_tokens_per_second": 13282.331 }, { "epoch": 0.1782671385502687, "grad_norm": 0.251953125, "learning_rate": 4.717346692735668e-05, "loss": 1.5671, "num_input_tokens_seen": 178257920, "step": 2720, "train_runtime": 13418.9115, "train_tokens_per_second": 13284.082 }, { "epoch": 0.17892253244199763, "grad_norm": 0.2099609375, "learning_rate": 4.7148907480731094e-05, "loss": 1.5407, "num_input_tokens_seen": 178913280, "step": 2730, "train_runtime": 13470.0283, "train_tokens_per_second": 13282.324 }, { "epoch": 0.17957792633372657, "grad_norm": 0.248046875, "learning_rate": 4.7124248248068706e-05, "loss": 1.549, "num_input_tokens_seen": 179568640, "step": 2740, "train_runtime": 13517.5158, "train_tokens_per_second": 13284.145 }, { "epoch": 0.1802333202254555, "grad_norm": 0.185546875, "learning_rate": 4.709948934046515e-05, "loss": 1.551, "num_input_tokens_seen": 180224000, "step": 2750, "train_runtime": 13568.4634, "train_tokens_per_second": 13282.565 }, { "epoch": 0.18088871411718443, "grad_norm": 0.1865234375, "learning_rate": 4.7074630869465145e-05, "loss": 1.5466, "num_input_tokens_seen": 180879360, "step": 2760, "train_runtime": 13615.9283, "train_tokens_per_second": 13284.394 }, { "epoch": 0.18154410800891335, "grad_norm": 0.16796875, "learning_rate": 4.704967294706193e-05, "loss": 1.5151, "num_input_tokens_seen": 181534720, "step": 2770, "train_runtime": 13667.032, "train_tokens_per_second": 13282.673 }, { "epoch": 0.1821995019006423, "grad_norm": 0.228515625, "learning_rate": 4.702461568569682e-05, "loss": 1.5511, "num_input_tokens_seen": 182190080, "step": 2780, "train_runtime": 13715.5093, "train_tokens_per_second": 13283.508 }, { "epoch": 0.1828548957923712, "grad_norm": 0.2470703125, "learning_rate": 4.699945919825868e-05, "loss": 1.542, "num_input_tokens_seen": 182845440, "step": 2790, "train_runtime": 13764.7959, "train_tokens_per_second": 13283.556 }, { "epoch": 0.18351028968410013, "grad_norm": 0.2041015625, "learning_rate": 4.6974203598083374e-05, "loss": 1.5322, "num_input_tokens_seen": 183500800, "step": 2800, "train_runtime": 13813.977, "train_tokens_per_second": 13283.705 }, { "epoch": 0.18416568357582908, "grad_norm": 0.220703125, "learning_rate": 4.694884899895335e-05, "loss": 1.5475, "num_input_tokens_seen": 184156160, "step": 2810, "train_runtime": 13863.4593, "train_tokens_per_second": 13283.565 }, { "epoch": 0.184821077467558, "grad_norm": 0.19921875, "learning_rate": 4.692339551509704e-05, "loss": 1.5304, "num_input_tokens_seen": 184811520, "step": 2820, "train_runtime": 13912.8124, "train_tokens_per_second": 13283.549 }, { "epoch": 0.18547647135928694, "grad_norm": 0.1923828125, "learning_rate": 4.689784326118835e-05, "loss": 1.5575, "num_input_tokens_seen": 185466880, "step": 2830, "train_runtime": 13962.361, "train_tokens_per_second": 13283.347 }, { "epoch": 0.18613186525101585, "grad_norm": 0.181640625, "learning_rate": 4.687219235234621e-05, "loss": 1.5644, "num_input_tokens_seen": 186122240, "step": 2840, "train_runtime": 14011.6649, "train_tokens_per_second": 13283.378 }, { "epoch": 0.1867872591427448, "grad_norm": 0.1728515625, "learning_rate": 4.6846442904133994e-05, "loss": 1.5544, "num_input_tokens_seen": 186777600, "step": 2850, "train_runtime": 14061.0002, "train_tokens_per_second": 13283.379 }, { "epoch": 0.18744265303447372, "grad_norm": 0.1767578125, "learning_rate": 4.6820595032559015e-05, "loss": 1.5417, "num_input_tokens_seen": 187432960, "step": 2860, "train_runtime": 14110.4475, "train_tokens_per_second": 13283.275 }, { "epoch": 0.18809804692620266, "grad_norm": 0.181640625, "learning_rate": 4.679464885407202e-05, "loss": 1.5476, "num_input_tokens_seen": 188088320, "step": 2870, "train_runtime": 14159.0632, "train_tokens_per_second": 13283.952 }, { "epoch": 0.18875344081793158, "grad_norm": 0.18359375, "learning_rate": 4.676860448556665e-05, "loss": 1.5418, "num_input_tokens_seen": 188743680, "step": 2880, "train_runtime": 14209.4168, "train_tokens_per_second": 13283.0 }, { "epoch": 0.1894088347096605, "grad_norm": 0.169921875, "learning_rate": 4.674246204437889e-05, "loss": 1.5477, "num_input_tokens_seen": 189399040, "step": 2890, "train_runtime": 14257.8701, "train_tokens_per_second": 13283.824 }, { "epoch": 0.19006422860138944, "grad_norm": 0.185546875, "learning_rate": 4.671622164828661e-05, "loss": 1.5268, "num_input_tokens_seen": 190054400, "step": 2900, "train_runtime": 14308.9899, "train_tokens_per_second": 13282.167 }, { "epoch": 0.19071962249311836, "grad_norm": 0.1689453125, "learning_rate": 4.668988341550894e-05, "loss": 1.5473, "num_input_tokens_seen": 190709760, "step": 2910, "train_runtime": 14356.7775, "train_tokens_per_second": 13283.605 }, { "epoch": 0.1913750163848473, "grad_norm": 0.193359375, "learning_rate": 4.6663447464705836e-05, "loss": 1.5299, "num_input_tokens_seen": 191365120, "step": 2920, "train_runtime": 14407.8888, "train_tokens_per_second": 13281.968 }, { "epoch": 0.19203041027657622, "grad_norm": 0.1796875, "learning_rate": 4.6636913914977454e-05, "loss": 1.5535, "num_input_tokens_seen": 192020480, "step": 2930, "train_runtime": 14455.3161, "train_tokens_per_second": 13283.727 }, { "epoch": 0.19268580416830516, "grad_norm": 0.1904296875, "learning_rate": 4.661028288586369e-05, "loss": 1.567, "num_input_tokens_seen": 192675840, "step": 2940, "train_runtime": 14506.5808, "train_tokens_per_second": 13281.961 }, { "epoch": 0.19334119806003408, "grad_norm": 0.1953125, "learning_rate": 4.658355449734358e-05, "loss": 1.556, "num_input_tokens_seen": 193331200, "step": 2950, "train_runtime": 14554.1126, "train_tokens_per_second": 13283.613 }, { "epoch": 0.193996591951763, "grad_norm": 0.181640625, "learning_rate": 4.655672886983481e-05, "loss": 1.5488, "num_input_tokens_seen": 193986560, "step": 2960, "train_runtime": 14608.2894, "train_tokens_per_second": 13279.211 }, { "epoch": 0.19465198584349194, "grad_norm": 0.173828125, "learning_rate": 4.6529806124193136e-05, "loss": 1.5496, "num_input_tokens_seen": 194641920, "step": 2970, "train_runtime": 14674.1891, "train_tokens_per_second": 13264.237 }, { "epoch": 0.19530737973522086, "grad_norm": 0.2294921875, "learning_rate": 4.650278638171186e-05, "loss": 1.5068, "num_input_tokens_seen": 195297280, "step": 2980, "train_runtime": 14727.5642, "train_tokens_per_second": 13260.664 }, { "epoch": 0.1959627736269498, "grad_norm": 0.2333984375, "learning_rate": 4.647566976412128e-05, "loss": 1.5425, "num_input_tokens_seen": 195952640, "step": 2990, "train_runtime": 14774.7394, "train_tokens_per_second": 13262.68 }, { "epoch": 0.19661816751867872, "grad_norm": 0.203125, "learning_rate": 4.644845639358812e-05, "loss": 1.5503, "num_input_tokens_seen": 196608000, "step": 3000, "train_runtime": 14825.7916, "train_tokens_per_second": 13261.214 }, { "epoch": 0.19727356141040767, "grad_norm": 0.2119140625, "learning_rate": 4.642114639271502e-05, "loss": 1.5561, "num_input_tokens_seen": 197263360, "step": 3010, "train_runtime": 14873.475, "train_tokens_per_second": 13262.762 }, { "epoch": 0.19792895530213658, "grad_norm": 0.19921875, "learning_rate": 4.639373988453997e-05, "loss": 1.5486, "num_input_tokens_seen": 197918720, "step": 3020, "train_runtime": 14924.3499, "train_tokens_per_second": 13261.463 }, { "epoch": 0.1985843491938655, "grad_norm": 0.169921875, "learning_rate": 4.636623699253573e-05, "loss": 1.5345, "num_input_tokens_seen": 198574080, "step": 3030, "train_runtime": 14971.6538, "train_tokens_per_second": 13263.336 }, { "epoch": 0.19923974308559445, "grad_norm": 0.203125, "learning_rate": 4.633863784060928e-05, "loss": 1.5261, "num_input_tokens_seen": 199229440, "step": 3040, "train_runtime": 15022.4123, "train_tokens_per_second": 13262.147 }, { "epoch": 0.19989513697732336, "grad_norm": 0.25390625, "learning_rate": 4.63109425531013e-05, "loss": 1.522, "num_input_tokens_seen": 199884800, "step": 3050, "train_runtime": 15070.1454, "train_tokens_per_second": 13263.628 }, { "epoch": 0.2005505308690523, "grad_norm": 0.185546875, "learning_rate": 4.6283151254785576e-05, "loss": 1.5265, "num_input_tokens_seen": 200540160, "step": 3060, "train_runtime": 15127.651, "train_tokens_per_second": 13256.53 }, { "epoch": 0.20120592476078122, "grad_norm": 0.1962890625, "learning_rate": 4.6255264070868434e-05, "loss": 1.534, "num_input_tokens_seen": 201195520, "step": 3070, "train_runtime": 15177.003, "train_tokens_per_second": 13256.604 }, { "epoch": 0.20186131865251017, "grad_norm": 0.177734375, "learning_rate": 4.622728112698821e-05, "loss": 1.5367, "num_input_tokens_seen": 201850880, "step": 3080, "train_runtime": 15227.8401, "train_tokens_per_second": 13255.385 }, { "epoch": 0.20251671254423909, "grad_norm": 0.1884765625, "learning_rate": 4.6199202549214633e-05, "loss": 1.5676, "num_input_tokens_seen": 202506240, "step": 3090, "train_runtime": 15275.9171, "train_tokens_per_second": 13256.568 }, { "epoch": 0.203172106435968, "grad_norm": 0.2060546875, "learning_rate": 4.617102846404831e-05, "loss": 1.5364, "num_input_tokens_seen": 203161600, "step": 3100, "train_runtime": 15326.9769, "train_tokens_per_second": 13255.164 }, { "epoch": 0.20382750032769695, "grad_norm": 0.169921875, "learning_rate": 4.6142758998420096e-05, "loss": 1.5359, "num_input_tokens_seen": 203816960, "step": 3110, "train_runtime": 15374.6805, "train_tokens_per_second": 13256.663 }, { "epoch": 0.20448289421942586, "grad_norm": 0.2099609375, "learning_rate": 4.611439427969062e-05, "loss": 1.535, "num_input_tokens_seen": 204472320, "step": 3120, "train_runtime": 15425.7911, "train_tokens_per_second": 13255.224 }, { "epoch": 0.2051382881111548, "grad_norm": 0.2216796875, "learning_rate": 4.608593443564958e-05, "loss": 1.554, "num_input_tokens_seen": 205127680, "step": 3130, "train_runtime": 15473.3834, "train_tokens_per_second": 13256.808 }, { "epoch": 0.20579368200288373, "grad_norm": 0.16796875, "learning_rate": 4.605737959451528e-05, "loss": 1.5308, "num_input_tokens_seen": 205783040, "step": 3140, "train_runtime": 15524.5191, "train_tokens_per_second": 13255.357 }, { "epoch": 0.20644907589461267, "grad_norm": 0.1787109375, "learning_rate": 4.602872988493399e-05, "loss": 1.533, "num_input_tokens_seen": 206438400, "step": 3150, "train_runtime": 15572.1477, "train_tokens_per_second": 13256.9 }, { "epoch": 0.2071044697863416, "grad_norm": 0.201171875, "learning_rate": 4.599998543597939e-05, "loss": 1.5344, "num_input_tokens_seen": 207093760, "step": 3160, "train_runtime": 15623.096, "train_tokens_per_second": 13255.616 }, { "epoch": 0.20775986367807053, "grad_norm": 0.1904296875, "learning_rate": 4.5971146377151974e-05, "loss": 1.5469, "num_input_tokens_seen": 207749120, "step": 3170, "train_runtime": 15670.6106, "train_tokens_per_second": 13257.245 }, { "epoch": 0.20841525756979945, "grad_norm": 0.2275390625, "learning_rate": 4.5942212838378495e-05, "loss": 1.5285, "num_input_tokens_seen": 208404480, "step": 3180, "train_runtime": 15721.7176, "train_tokens_per_second": 13255.834 }, { "epoch": 0.20907065146152837, "grad_norm": 0.203125, "learning_rate": 4.591318495001135e-05, "loss": 1.5492, "num_input_tokens_seen": 209059840, "step": 3190, "train_runtime": 15768.7728, "train_tokens_per_second": 13257.838 }, { "epoch": 0.2097260453532573, "grad_norm": 0.1806640625, "learning_rate": 4.5884062842828e-05, "loss": 1.5425, "num_input_tokens_seen": 209715200, "step": 3200, "train_runtime": 15819.8444, "train_tokens_per_second": 13256.464 }, { "epoch": 0.21038143924498623, "grad_norm": 0.23046875, "learning_rate": 4.5854846648030375e-05, "loss": 1.5254, "num_input_tokens_seen": 210370560, "step": 3210, "train_runtime": 15867.7451, "train_tokens_per_second": 13257.748 }, { "epoch": 0.21103683313671517, "grad_norm": 0.201171875, "learning_rate": 4.582553649724432e-05, "loss": 1.5353, "num_input_tokens_seen": 211025920, "step": 3220, "train_runtime": 15918.8081, "train_tokens_per_second": 13256.389 }, { "epoch": 0.2116922270284441, "grad_norm": 0.20703125, "learning_rate": 4.579613252251894e-05, "loss": 1.5346, "num_input_tokens_seen": 211681280, "step": 3230, "train_runtime": 15966.4803, "train_tokens_per_second": 13257.855 }, { "epoch": 0.21234762092017304, "grad_norm": 0.212890625, "learning_rate": 4.5766634856326074e-05, "loss": 1.5241, "num_input_tokens_seen": 212336640, "step": 3240, "train_runtime": 16020.155, "train_tokens_per_second": 13254.344 }, { "epoch": 0.21300301481190195, "grad_norm": 0.1904296875, "learning_rate": 4.573704363155964e-05, "loss": 1.5292, "num_input_tokens_seen": 212992000, "step": 3250, "train_runtime": 16067.4404, "train_tokens_per_second": 13256.125 }, { "epoch": 0.21365840870363087, "grad_norm": 0.1865234375, "learning_rate": 4.570735898153505e-05, "loss": 1.5515, "num_input_tokens_seen": 213647360, "step": 3260, "train_runtime": 16118.2718, "train_tokens_per_second": 13254.979 }, { "epoch": 0.21431380259535981, "grad_norm": 0.18359375, "learning_rate": 4.567758103998865e-05, "loss": 1.5451, "num_input_tokens_seen": 214302720, "step": 3270, "train_runtime": 16166.0938, "train_tokens_per_second": 13256.308 }, { "epoch": 0.21496919648708873, "grad_norm": 0.19921875, "learning_rate": 4.564770994107706e-05, "loss": 1.5483, "num_input_tokens_seen": 214958080, "step": 3280, "train_runtime": 16217.0655, "train_tokens_per_second": 13255.054 }, { "epoch": 0.21562459037881768, "grad_norm": 0.2099609375, "learning_rate": 4.561774581937661e-05, "loss": 1.5559, "num_input_tokens_seen": 215613440, "step": 3290, "train_runtime": 16264.5678, "train_tokens_per_second": 13256.635 }, { "epoch": 0.2162799842705466, "grad_norm": 0.1767578125, "learning_rate": 4.558768880988271e-05, "loss": 1.5509, "num_input_tokens_seen": 216268800, "step": 3300, "train_runtime": 16315.611, "train_tokens_per_second": 13255.33 }, { "epoch": 0.21693537816227554, "grad_norm": 0.21875, "learning_rate": 4.555753904800927e-05, "loss": 1.544, "num_input_tokens_seen": 216924160, "step": 3310, "train_runtime": 16363.2636, "train_tokens_per_second": 13256.778 }, { "epoch": 0.21759077205400446, "grad_norm": 0.197265625, "learning_rate": 4.552729666958803e-05, "loss": 1.5335, "num_input_tokens_seen": 217579520, "step": 3320, "train_runtime": 16414.4723, "train_tokens_per_second": 13255.347 }, { "epoch": 0.21824616594573337, "grad_norm": 0.224609375, "learning_rate": 4.5496961810868044e-05, "loss": 1.5209, "num_input_tokens_seen": 218234880, "step": 3330, "train_runtime": 16462.192, "train_tokens_per_second": 13256.733 }, { "epoch": 0.21890155983746232, "grad_norm": 0.171875, "learning_rate": 4.546653460851496e-05, "loss": 1.5449, "num_input_tokens_seen": 218890240, "step": 3340, "train_runtime": 16513.2528, "train_tokens_per_second": 13255.428 }, { "epoch": 0.21955695372919123, "grad_norm": 0.1865234375, "learning_rate": 4.5436015199610515e-05, "loss": 1.5597, "num_input_tokens_seen": 219545600, "step": 3350, "train_runtime": 16560.7291, "train_tokens_per_second": 13257.001 }, { "epoch": 0.22021234762092018, "grad_norm": 0.1650390625, "learning_rate": 4.540540372165178e-05, "loss": 1.5398, "num_input_tokens_seen": 220200960, "step": 3360, "train_runtime": 16611.8103, "train_tokens_per_second": 13255.687 }, { "epoch": 0.2208677415126491, "grad_norm": 0.177734375, "learning_rate": 4.5374700312550696e-05, "loss": 1.5739, "num_input_tokens_seen": 220856320, "step": 3370, "train_runtime": 16659.4206, "train_tokens_per_second": 13257.143 }, { "epoch": 0.22152313540437804, "grad_norm": 0.1806640625, "learning_rate": 4.534390511063333e-05, "loss": 1.574, "num_input_tokens_seen": 221511680, "step": 3380, "train_runtime": 16710.6249, "train_tokens_per_second": 13255.739 }, { "epoch": 0.22217852929610696, "grad_norm": 0.2314453125, "learning_rate": 4.53130182546393e-05, "loss": 1.5351, "num_input_tokens_seen": 222167040, "step": 3390, "train_runtime": 16758.4358, "train_tokens_per_second": 13257.027 }, { "epoch": 0.2228339231878359, "grad_norm": 0.1650390625, "learning_rate": 4.528203988372116e-05, "loss": 1.5385, "num_input_tokens_seen": 222822400, "step": 3400, "train_runtime": 16809.5486, "train_tokens_per_second": 13255.704 }, { "epoch": 0.22348931707956482, "grad_norm": 0.181640625, "learning_rate": 4.525097013744377e-05, "loss": 1.5377, "num_input_tokens_seen": 223477760, "step": 3410, "train_runtime": 16856.7785, "train_tokens_per_second": 13257.442 }, { "epoch": 0.22414471097129374, "grad_norm": 0.1845703125, "learning_rate": 4.521980915578362e-05, "loss": 1.542, "num_input_tokens_seen": 224133120, "step": 3420, "train_runtime": 16907.7757, "train_tokens_per_second": 13256.216 }, { "epoch": 0.22480010486302268, "grad_norm": 0.2001953125, "learning_rate": 4.518855707912828e-05, "loss": 1.5416, "num_input_tokens_seen": 224788480, "step": 3430, "train_runtime": 16955.4795, "train_tokens_per_second": 13257.571 }, { "epoch": 0.2254554987547516, "grad_norm": 0.1884765625, "learning_rate": 4.51572140482757e-05, "loss": 1.5448, "num_input_tokens_seen": 225443840, "step": 3440, "train_runtime": 17006.6353, "train_tokens_per_second": 13256.228 }, { "epoch": 0.22611089264648054, "grad_norm": 0.2431640625, "learning_rate": 4.512578020443359e-05, "loss": 1.5327, "num_input_tokens_seen": 226099200, "step": 3450, "train_runtime": 17054.4533, "train_tokens_per_second": 13257.487 }, { "epoch": 0.22676628653820946, "grad_norm": 0.1953125, "learning_rate": 4.509425568921883e-05, "loss": 1.5396, "num_input_tokens_seen": 226754560, "step": 3460, "train_runtime": 17105.5055, "train_tokens_per_second": 13256.233 }, { "epoch": 0.2274216804299384, "grad_norm": 0.16796875, "learning_rate": 4.506264064465676e-05, "loss": 1.5448, "num_input_tokens_seen": 227409920, "step": 3470, "train_runtime": 17152.7025, "train_tokens_per_second": 13257.964 }, { "epoch": 0.22807707432166732, "grad_norm": 0.2060546875, "learning_rate": 4.503093521318059e-05, "loss": 1.5243, "num_input_tokens_seen": 228065280, "step": 3480, "train_runtime": 17203.7605, "train_tokens_per_second": 13256.711 }, { "epoch": 0.22873246821339624, "grad_norm": 0.1845703125, "learning_rate": 4.4999139537630766e-05, "loss": 1.5409, "num_input_tokens_seen": 228720640, "step": 3490, "train_runtime": 17251.6087, "train_tokens_per_second": 13257.931 }, { "epoch": 0.22938786210512518, "grad_norm": 0.169921875, "learning_rate": 4.496725376125425e-05, "loss": 1.5229, "num_input_tokens_seen": 229376000, "step": 3500, "train_runtime": 17302.7179, "train_tokens_per_second": 13256.646 }, { "epoch": 0.2300432559968541, "grad_norm": 0.2119140625, "learning_rate": 4.493527802770399e-05, "loss": 1.5259, "num_input_tokens_seen": 230031360, "step": 3510, "train_runtime": 17350.4608, "train_tokens_per_second": 13257.94 }, { "epoch": 0.23069864988858305, "grad_norm": 0.2138671875, "learning_rate": 4.4903212481038176e-05, "loss": 1.5404, "num_input_tokens_seen": 230686720, "step": 3520, "train_runtime": 17401.4888, "train_tokens_per_second": 13256.723 }, { "epoch": 0.23135404378031196, "grad_norm": 0.19921875, "learning_rate": 4.487105726571963e-05, "loss": 1.5387, "num_input_tokens_seen": 231342080, "step": 3530, "train_runtime": 17448.894, "train_tokens_per_second": 13258.266 }, { "epoch": 0.2320094376720409, "grad_norm": 0.1904296875, "learning_rate": 4.483881252661517e-05, "loss": 1.5466, "num_input_tokens_seen": 231997440, "step": 3540, "train_runtime": 17499.9463, "train_tokens_per_second": 13257.037 }, { "epoch": 0.23266483156376982, "grad_norm": 0.1708984375, "learning_rate": 4.4806478408994926e-05, "loss": 1.5229, "num_input_tokens_seen": 232652800, "step": 3550, "train_runtime": 17547.8683, "train_tokens_per_second": 13258.18 }, { "epoch": 0.23332022545549874, "grad_norm": 0.1953125, "learning_rate": 4.47740550585317e-05, "loss": 1.5332, "num_input_tokens_seen": 233308160, "step": 3560, "train_runtime": 17598.9063, "train_tokens_per_second": 13256.969 }, { "epoch": 0.2339756193472277, "grad_norm": 0.1767578125, "learning_rate": 4.474154262130029e-05, "loss": 1.5248, "num_input_tokens_seen": 233963520, "step": 3570, "train_runtime": 17646.6173, "train_tokens_per_second": 13258.265 }, { "epoch": 0.2346310132389566, "grad_norm": 0.197265625, "learning_rate": 4.470894124377691e-05, "loss": 1.53, "num_input_tokens_seen": 234618880, "step": 3580, "train_runtime": 17697.6489, "train_tokens_per_second": 13257.065 }, { "epoch": 0.23528640713068555, "grad_norm": 0.2001953125, "learning_rate": 4.467625107283841e-05, "loss": 1.56, "num_input_tokens_seen": 235274240, "step": 3590, "train_runtime": 17745.1419, "train_tokens_per_second": 13258.516 }, { "epoch": 0.23594180102241447, "grad_norm": 0.1767578125, "learning_rate": 4.464347225576169e-05, "loss": 1.5421, "num_input_tokens_seen": 235929600, "step": 3600, "train_runtime": 17796.1973, "train_tokens_per_second": 13257.304 }, { "epoch": 0.2365971949141434, "grad_norm": 0.212890625, "learning_rate": 4.461060494022306e-05, "loss": 1.547, "num_input_tokens_seen": 236584960, "step": 3610, "train_runtime": 17844.3069, "train_tokens_per_second": 13258.288 }, { "epoch": 0.23725258880587233, "grad_norm": 0.224609375, "learning_rate": 4.457764927429747e-05, "loss": 1.5441, "num_input_tokens_seen": 237240320, "step": 3620, "train_runtime": 17895.0806, "train_tokens_per_second": 13257.293 }, { "epoch": 0.23790798269760124, "grad_norm": 0.2314453125, "learning_rate": 4.454460540645796e-05, "loss": 1.5499, "num_input_tokens_seen": 237895680, "step": 3630, "train_runtime": 17943.9975, "train_tokens_per_second": 13257.675 }, { "epoch": 0.2385633765893302, "grad_norm": 0.22265625, "learning_rate": 4.451147348557493e-05, "loss": 1.5462, "num_input_tokens_seen": 238551040, "step": 3640, "train_runtime": 17994.0076, "train_tokens_per_second": 13257.249 }, { "epoch": 0.2392187704810591, "grad_norm": 0.18359375, "learning_rate": 4.447825366091547e-05, "loss": 1.531, "num_input_tokens_seen": 239206400, "step": 3650, "train_runtime": 18043.0124, "train_tokens_per_second": 13257.564 }, { "epoch": 0.23987416437278805, "grad_norm": 0.166015625, "learning_rate": 4.4444946082142705e-05, "loss": 1.5582, "num_input_tokens_seen": 239861760, "step": 3660, "train_runtime": 18092.4908, "train_tokens_per_second": 13257.531 }, { "epoch": 0.24052955826451697, "grad_norm": 0.2099609375, "learning_rate": 4.441155089931511e-05, "loss": 1.5369, "num_input_tokens_seen": 240517120, "step": 3670, "train_runtime": 18141.5662, "train_tokens_per_second": 13257.792 }, { "epoch": 0.2411849521562459, "grad_norm": 0.158203125, "learning_rate": 4.437806826288584e-05, "loss": 1.528, "num_input_tokens_seen": 241172480, "step": 3680, "train_runtime": 18190.9954, "train_tokens_per_second": 13257.795 }, { "epoch": 0.24184034604797483, "grad_norm": 0.2265625, "learning_rate": 4.434449832370203e-05, "loss": 1.5457, "num_input_tokens_seen": 241827840, "step": 3690, "train_runtime": 18240.4318, "train_tokens_per_second": 13257.791 }, { "epoch": 0.24249573993970377, "grad_norm": 0.1806640625, "learning_rate": 4.431084123300416e-05, "loss": 1.5326, "num_input_tokens_seen": 242483200, "step": 3700, "train_runtime": 18289.7232, "train_tokens_per_second": 13257.893 }, { "epoch": 0.2431511338314327, "grad_norm": 0.19921875, "learning_rate": 4.427709714242535e-05, "loss": 1.5398, "num_input_tokens_seen": 243138560, "step": 3710, "train_runtime": 18340.5446, "train_tokens_per_second": 13256.889 }, { "epoch": 0.2438065277231616, "grad_norm": 0.2177734375, "learning_rate": 4.424326620399065e-05, "loss": 1.5324, "num_input_tokens_seen": 243793920, "step": 3720, "train_runtime": 18389.9746, "train_tokens_per_second": 13256.893 }, { "epoch": 0.24446192161489055, "grad_norm": 0.193359375, "learning_rate": 4.420934857011639e-05, "loss": 1.5275, "num_input_tokens_seen": 244449280, "step": 3730, "train_runtime": 18439.4003, "train_tokens_per_second": 13256.9 }, { "epoch": 0.24511731550661947, "grad_norm": 0.2275390625, "learning_rate": 4.4175344393609506e-05, "loss": 1.5291, "num_input_tokens_seen": 245104640, "step": 3740, "train_runtime": 18488.6274, "train_tokens_per_second": 13257.049 }, { "epoch": 0.24577270939834842, "grad_norm": 0.20703125, "learning_rate": 4.4141253827666794e-05, "loss": 1.553, "num_input_tokens_seen": 245760000, "step": 3750, "train_runtime": 18538.0921, "train_tokens_per_second": 13257.028 }, { "epoch": 0.24642810329007733, "grad_norm": 0.2177734375, "learning_rate": 4.410707702587427e-05, "loss": 1.536, "num_input_tokens_seen": 246415360, "step": 3760, "train_runtime": 18587.2885, "train_tokens_per_second": 13257.198 }, { "epoch": 0.24708349718180628, "grad_norm": 0.1962890625, "learning_rate": 4.4072814142206465e-05, "loss": 1.52, "num_input_tokens_seen": 247070720, "step": 3770, "train_runtime": 18637.3813, "train_tokens_per_second": 13256.729 }, { "epoch": 0.2477388910735352, "grad_norm": 0.189453125, "learning_rate": 4.4038465331025734e-05, "loss": 1.5369, "num_input_tokens_seen": 247726080, "step": 3780, "train_runtime": 18686.1365, "train_tokens_per_second": 13257.212 }, { "epoch": 0.2483942849652641, "grad_norm": 0.1875, "learning_rate": 4.400403074708155e-05, "loss": 1.5142, "num_input_tokens_seen": 248381440, "step": 3790, "train_runtime": 18737.1149, "train_tokens_per_second": 13256.12 }, { "epoch": 0.24904967885699306, "grad_norm": 0.1845703125, "learning_rate": 4.39695105455098e-05, "loss": 1.5484, "num_input_tokens_seen": 249036800, "step": 3800, "train_runtime": 18784.925, "train_tokens_per_second": 13257.269 }, { "epoch": 0.24970507274872197, "grad_norm": 0.216796875, "learning_rate": 4.3934904881832106e-05, "loss": 1.5537, "num_input_tokens_seen": 249692160, "step": 3810, "train_runtime": 18835.8775, "train_tokens_per_second": 13256.2 }, { "epoch": 0.2503604666404509, "grad_norm": 0.173828125, "learning_rate": 4.390021391195514e-05, "loss": 1.5429, "num_input_tokens_seen": 250347520, "step": 3820, "train_runtime": 18883.2766, "train_tokens_per_second": 13257.631 }, { "epoch": 0.25101586053217984, "grad_norm": 0.166015625, "learning_rate": 4.3865437792169874e-05, "loss": 1.5065, "num_input_tokens_seen": 251002880, "step": 3830, "train_runtime": 18934.135, "train_tokens_per_second": 13256.633 }, { "epoch": 0.25167125442390875, "grad_norm": 0.189453125, "learning_rate": 4.383057667915089e-05, "loss": 1.5435, "num_input_tokens_seen": 251658240, "step": 3840, "train_runtime": 18981.6823, "train_tokens_per_second": 13257.952 }, { "epoch": 0.2523266483156377, "grad_norm": 0.201171875, "learning_rate": 4.3795630729955714e-05, "loss": 1.5372, "num_input_tokens_seen": 252313600, "step": 3850, "train_runtime": 19032.3082, "train_tokens_per_second": 13257.12 }, { "epoch": 0.25298204220736664, "grad_norm": 0.181640625, "learning_rate": 4.376060010202407e-05, "loss": 1.5377, "num_input_tokens_seen": 252968960, "step": 3860, "train_runtime": 19079.6192, "train_tokens_per_second": 13258.596 }, { "epoch": 0.25363743609909556, "grad_norm": 0.1875, "learning_rate": 4.372548495317716e-05, "loss": 1.5354, "num_input_tokens_seen": 253624320, "step": 3870, "train_runtime": 19130.4345, "train_tokens_per_second": 13257.635 }, { "epoch": 0.2542928299908245, "grad_norm": 0.21484375, "learning_rate": 4.369028544161701e-05, "loss": 1.5399, "num_input_tokens_seen": 254279680, "step": 3880, "train_runtime": 19178.0374, "train_tokens_per_second": 13258.9 }, { "epoch": 0.2549482238825534, "grad_norm": 0.201171875, "learning_rate": 4.365500172592566e-05, "loss": 1.5412, "num_input_tokens_seen": 254935040, "step": 3890, "train_runtime": 19228.8258, "train_tokens_per_second": 13257.962 }, { "epoch": 0.25560361777428237, "grad_norm": 0.220703125, "learning_rate": 4.3619633965064585e-05, "loss": 1.5297, "num_input_tokens_seen": 255590400, "step": 3900, "train_runtime": 19276.0598, "train_tokens_per_second": 13259.473 }, { "epoch": 0.2562590116660113, "grad_norm": 0.1875, "learning_rate": 4.358418231837384e-05, "loss": 1.5374, "num_input_tokens_seen": 256245760, "step": 3910, "train_runtime": 19327.0348, "train_tokens_per_second": 13258.41 }, { "epoch": 0.2569144055577402, "grad_norm": 0.1962890625, "learning_rate": 4.354864694557144e-05, "loss": 1.5445, "num_input_tokens_seen": 256901120, "step": 3920, "train_runtime": 19374.5183, "train_tokens_per_second": 13259.742 }, { "epoch": 0.2575697994494691, "grad_norm": 0.1611328125, "learning_rate": 4.35130280067526e-05, "loss": 1.5462, "num_input_tokens_seen": 257556480, "step": 3930, "train_runtime": 19425.5821, "train_tokens_per_second": 13258.624 }, { "epoch": 0.25822519334119803, "grad_norm": 0.2060546875, "learning_rate": 4.347732566238901e-05, "loss": 1.5328, "num_input_tokens_seen": 258211840, "step": 3940, "train_runtime": 19472.7448, "train_tokens_per_second": 13260.167 }, { "epoch": 0.258880587232927, "grad_norm": 0.1806640625, "learning_rate": 4.344154007332813e-05, "loss": 1.5575, "num_input_tokens_seen": 258867200, "step": 3950, "train_runtime": 19523.7821, "train_tokens_per_second": 13259.07 }, { "epoch": 0.2595359811246559, "grad_norm": 0.20703125, "learning_rate": 4.3405671400792434e-05, "loss": 1.5448, "num_input_tokens_seen": 259522560, "step": 3960, "train_runtime": 19571.2867, "train_tokens_per_second": 13260.373 }, { "epoch": 0.26019137501638484, "grad_norm": 0.1845703125, "learning_rate": 4.336971980637876e-05, "loss": 1.5263, "num_input_tokens_seen": 260177920, "step": 3970, "train_runtime": 19622.1556, "train_tokens_per_second": 13259.395 }, { "epoch": 0.26084676890811376, "grad_norm": 0.181640625, "learning_rate": 4.333368545205748e-05, "loss": 1.5337, "num_input_tokens_seen": 260833280, "step": 3980, "train_runtime": 19669.3739, "train_tokens_per_second": 13260.884 }, { "epoch": 0.26150216279984273, "grad_norm": 0.2255859375, "learning_rate": 4.3297568500171827e-05, "loss": 1.5613, "num_input_tokens_seen": 261488640, "step": 3990, "train_runtime": 19720.3793, "train_tokens_per_second": 13259.818 }, { "epoch": 0.26215755669157165, "grad_norm": 0.1748046875, "learning_rate": 4.326136911343718e-05, "loss": 1.532, "num_input_tokens_seen": 262144000, "step": 4000, "train_runtime": 19767.5333, "train_tokens_per_second": 13261.341 }, { "epoch": 0.26281295058330056, "grad_norm": 0.1767578125, "learning_rate": 4.322508745494027e-05, "loss": 1.5337, "num_input_tokens_seen": 262799360, "step": 4010, "train_runtime": 19818.5765, "train_tokens_per_second": 13260.254 }, { "epoch": 0.2634683444750295, "grad_norm": 0.205078125, "learning_rate": 4.318872368813851e-05, "loss": 1.5401, "num_input_tokens_seen": 263454720, "step": 4020, "train_runtime": 19865.6814, "train_tokens_per_second": 13261.801 }, { "epoch": 0.2641237383667584, "grad_norm": 0.2216796875, "learning_rate": 4.3152277976859226e-05, "loss": 1.5662, "num_input_tokens_seen": 264110080, "step": 4030, "train_runtime": 19916.6395, "train_tokens_per_second": 13260.775 }, { "epoch": 0.26477913225848737, "grad_norm": 0.177734375, "learning_rate": 4.311575048529891e-05, "loss": 1.5429, "num_input_tokens_seen": 264765440, "step": 4040, "train_runtime": 19964.1731, "train_tokens_per_second": 13262.029 }, { "epoch": 0.2654345261502163, "grad_norm": 0.2041015625, "learning_rate": 4.3079141378022494e-05, "loss": 1.5062, "num_input_tokens_seen": 265420800, "step": 4050, "train_runtime": 20022.0631, "train_tokens_per_second": 13256.416 }, { "epoch": 0.2660899200419452, "grad_norm": 0.169921875, "learning_rate": 4.304245081996264e-05, "loss": 1.5295, "num_input_tokens_seen": 266076160, "step": 4060, "train_runtime": 20069.7228, "train_tokens_per_second": 13257.59 }, { "epoch": 0.2667453139336741, "grad_norm": 0.197265625, "learning_rate": 4.300567897641892e-05, "loss": 1.5337, "num_input_tokens_seen": 266731520, "step": 4070, "train_runtime": 20120.8069, "train_tokens_per_second": 13256.502 }, { "epoch": 0.2674007078254031, "grad_norm": 0.185546875, "learning_rate": 4.296882601305714e-05, "loss": 1.5483, "num_input_tokens_seen": 267386880, "step": 4080, "train_runtime": 20168.1671, "train_tokens_per_second": 13257.867 }, { "epoch": 0.268056101717132, "grad_norm": 0.1865234375, "learning_rate": 4.2931892095908564e-05, "loss": 1.5473, "num_input_tokens_seen": 268042240, "step": 4090, "train_runtime": 20219.2986, "train_tokens_per_second": 13256.753 }, { "epoch": 0.26871149560886093, "grad_norm": 0.1904296875, "learning_rate": 4.289487739136918e-05, "loss": 1.5528, "num_input_tokens_seen": 268697600, "step": 4100, "train_runtime": 20266.8911, "train_tokens_per_second": 13257.958 }, { "epoch": 0.26936688950058985, "grad_norm": 0.2158203125, "learning_rate": 4.2857782066198944e-05, "loss": 1.5487, "num_input_tokens_seen": 269352960, "step": 4110, "train_runtime": 20318.018, "train_tokens_per_second": 13256.852 }, { "epoch": 0.27002228339231876, "grad_norm": 0.193359375, "learning_rate": 4.2820606287521e-05, "loss": 1.5398, "num_input_tokens_seen": 270008320, "step": 4120, "train_runtime": 20366.986, "train_tokens_per_second": 13257.156 }, { "epoch": 0.27067767728404774, "grad_norm": 0.1630859375, "learning_rate": 4.278335022282099e-05, "loss": 1.531, "num_input_tokens_seen": 270663680, "step": 4130, "train_runtime": 20418.1174, "train_tokens_per_second": 13256.055 }, { "epoch": 0.27133307117577665, "grad_norm": 0.2119140625, "learning_rate": 4.2746014039946235e-05, "loss": 1.5613, "num_input_tokens_seen": 271319040, "step": 4140, "train_runtime": 20465.8213, "train_tokens_per_second": 13257.178 }, { "epoch": 0.27198846506750557, "grad_norm": 0.203125, "learning_rate": 4.270859790710503e-05, "loss": 1.5573, "num_input_tokens_seen": 271974400, "step": 4150, "train_runtime": 20517.0361, "train_tokens_per_second": 13256.028 }, { "epoch": 0.2726438589592345, "grad_norm": 0.1923828125, "learning_rate": 4.2671101992865846e-05, "loss": 1.5433, "num_input_tokens_seen": 272629760, "step": 4160, "train_runtime": 20564.6484, "train_tokens_per_second": 13257.205 }, { "epoch": 0.2732992528509634, "grad_norm": 0.1923828125, "learning_rate": 4.2633526466156595e-05, "loss": 1.5494, "num_input_tokens_seen": 273285120, "step": 4170, "train_runtime": 20615.7329, "train_tokens_per_second": 13256.144 }, { "epoch": 0.2739546467426924, "grad_norm": 0.169921875, "learning_rate": 4.2595871496263855e-05, "loss": 1.5651, "num_input_tokens_seen": 273940480, "step": 4180, "train_runtime": 20663.0874, "train_tokens_per_second": 13257.481 }, { "epoch": 0.2746100406344213, "grad_norm": 0.2021484375, "learning_rate": 4.255813725283213e-05, "loss": 1.5597, "num_input_tokens_seen": 274595840, "step": 4190, "train_runtime": 20713.9693, "train_tokens_per_second": 13256.553 }, { "epoch": 0.2752654345261502, "grad_norm": 0.2353515625, "learning_rate": 4.252032390586306e-05, "loss": 1.5486, "num_input_tokens_seen": 275251200, "step": 4200, "train_runtime": 20756.1484, "train_tokens_per_second": 13261.189 }, { "epoch": 0.2759208284178791, "grad_norm": 0.1787109375, "learning_rate": 4.248243162571466e-05, "loss": 1.5635, "num_input_tokens_seen": 275906560, "step": 4210, "train_runtime": 20780.6597, "train_tokens_per_second": 13277.084 }, { "epoch": 0.2765762223096081, "grad_norm": 0.166015625, "learning_rate": 4.2444460583100565e-05, "loss": 1.5528, "num_input_tokens_seen": 276561920, "step": 4220, "train_runtime": 20805.2226, "train_tokens_per_second": 13292.908 }, { "epoch": 0.277231616201337, "grad_norm": 0.1875, "learning_rate": 4.240641094908925e-05, "loss": 1.5586, "num_input_tokens_seen": 277217280, "step": 4230, "train_runtime": 20829.7518, "train_tokens_per_second": 13308.717 }, { "epoch": 0.27788701009306593, "grad_norm": 0.181640625, "learning_rate": 4.2368282895103276e-05, "loss": 1.5403, "num_input_tokens_seen": 277872640, "step": 4240, "train_runtime": 20854.2278, "train_tokens_per_second": 13324.523 }, { "epoch": 0.27854240398479485, "grad_norm": 0.21875, "learning_rate": 4.2330076592918466e-05, "loss": 1.5434, "num_input_tokens_seen": 278528000, "step": 4250, "train_runtime": 20878.7151, "train_tokens_per_second": 13340.285 }, { "epoch": 0.27919779787652377, "grad_norm": 0.2216796875, "learning_rate": 4.229179221466322e-05, "loss": 1.5339, "num_input_tokens_seen": 279183360, "step": 4260, "train_runtime": 20903.3661, "train_tokens_per_second": 13355.904 }, { "epoch": 0.27985319176825274, "grad_norm": 0.2021484375, "learning_rate": 4.2253429932817656e-05, "loss": 1.5494, "num_input_tokens_seen": 279838720, "step": 4270, "train_runtime": 20927.8637, "train_tokens_per_second": 13371.586 }, { "epoch": 0.28050858565998166, "grad_norm": 0.189453125, "learning_rate": 4.221498992021288e-05, "loss": 1.5413, "num_input_tokens_seen": 280494080, "step": 4280, "train_runtime": 20952.3525, "train_tokens_per_second": 13387.236 }, { "epoch": 0.2811639795517106, "grad_norm": 0.19140625, "learning_rate": 4.217647235003018e-05, "loss": 1.5478, "num_input_tokens_seen": 281149440, "step": 4290, "train_runtime": 20976.8405, "train_tokens_per_second": 13402.85 }, { "epoch": 0.2818193734434395, "grad_norm": 0.2080078125, "learning_rate": 4.2137877395800274e-05, "loss": 1.554, "num_input_tokens_seen": 281804800, "step": 4300, "train_runtime": 21001.3432, "train_tokens_per_second": 13418.418 }, { "epoch": 0.2824747673351684, "grad_norm": 0.1884765625, "learning_rate": 4.209920523140251e-05, "loss": 1.5536, "num_input_tokens_seen": 282460160, "step": 4310, "train_runtime": 21025.8315, "train_tokens_per_second": 13433.959 }, { "epoch": 0.2831301612268974, "grad_norm": 0.1982421875, "learning_rate": 4.2060456031064074e-05, "loss": 1.5445, "num_input_tokens_seen": 283115520, "step": 4320, "train_runtime": 21050.47, "train_tokens_per_second": 13449.368 }, { "epoch": 0.2837855551186263, "grad_norm": 0.177734375, "learning_rate": 4.202162996935923e-05, "loss": 1.5173, "num_input_tokens_seen": 283770880, "step": 4330, "train_runtime": 21074.9688, "train_tokens_per_second": 13464.83 }, { "epoch": 0.2844409490103552, "grad_norm": 0.2138671875, "learning_rate": 4.1982727221208517e-05, "loss": 1.5359, "num_input_tokens_seen": 284426240, "step": 4340, "train_runtime": 21099.4536, "train_tokens_per_second": 13480.266 }, { "epoch": 0.28509634290208413, "grad_norm": 0.201171875, "learning_rate": 4.194374796187797e-05, "loss": 1.5598, "num_input_tokens_seen": 285081600, "step": 4350, "train_runtime": 21123.9442, "train_tokens_per_second": 13495.661 }, { "epoch": 0.2857517367938131, "grad_norm": 0.19140625, "learning_rate": 4.19046923669783e-05, "loss": 1.548, "num_input_tokens_seen": 285736960, "step": 4360, "train_runtime": 21148.4359, "train_tokens_per_second": 13511.021 }, { "epoch": 0.286407130685542, "grad_norm": 0.18359375, "learning_rate": 4.186556061246416e-05, "loss": 1.5274, "num_input_tokens_seen": 286392320, "step": 4370, "train_runtime": 21173.0739, "train_tokens_per_second": 13526.251 }, { "epoch": 0.28706252457727094, "grad_norm": 0.2080078125, "learning_rate": 4.182635287463331e-05, "loss": 1.5453, "num_input_tokens_seen": 287047680, "step": 4380, "train_runtime": 21197.582, "train_tokens_per_second": 13541.529 }, { "epoch": 0.28771791846899986, "grad_norm": 0.185546875, "learning_rate": 4.1787069330125826e-05, "loss": 1.5593, "num_input_tokens_seen": 287703040, "step": 4390, "train_runtime": 21222.0766, "train_tokens_per_second": 13556.781 }, { "epoch": 0.2883733123607288, "grad_norm": 0.208984375, "learning_rate": 4.1747710155923317e-05, "loss": 1.55, "num_input_tokens_seen": 288358400, "step": 4400, "train_runtime": 21246.5681, "train_tokens_per_second": 13571.999 }, { "epoch": 0.28902870625245775, "grad_norm": 0.1787109375, "learning_rate": 4.1708275529348104e-05, "loss": 1.5617, "num_input_tokens_seen": 289013760, "step": 4410, "train_runtime": 21271.0491, "train_tokens_per_second": 13587.189 }, { "epoch": 0.28968410014418666, "grad_norm": 0.232421875, "learning_rate": 4.166876562806247e-05, "loss": 1.5523, "num_input_tokens_seen": 289669120, "step": 4420, "train_runtime": 21295.5481, "train_tokens_per_second": 13602.332 }, { "epoch": 0.2903394940359156, "grad_norm": 0.181640625, "learning_rate": 4.16291806300678e-05, "loss": 1.5327, "num_input_tokens_seen": 290324480, "step": 4430, "train_runtime": 21320.1943, "train_tokens_per_second": 13617.347 }, { "epoch": 0.2909948879276445, "grad_norm": 0.173828125, "learning_rate": 4.158952071370382e-05, "loss": 1.5653, "num_input_tokens_seen": 290979840, "step": 4440, "train_runtime": 21344.6808, "train_tokens_per_second": 13632.429 }, { "epoch": 0.29165028181937347, "grad_norm": 0.1728515625, "learning_rate": 4.154978605764779e-05, "loss": 1.5519, "num_input_tokens_seen": 291635200, "step": 4450, "train_runtime": 21369.1786, "train_tokens_per_second": 13647.469 }, { "epoch": 0.2923056757111024, "grad_norm": 0.1826171875, "learning_rate": 4.150997684091367e-05, "loss": 1.5494, "num_input_tokens_seen": 292290560, "step": 4460, "train_runtime": 21393.6713, "train_tokens_per_second": 13662.478 }, { "epoch": 0.2929610696028313, "grad_norm": 0.1708984375, "learning_rate": 4.147009324285135e-05, "loss": 1.5385, "num_input_tokens_seen": 292945920, "step": 4470, "train_runtime": 21418.1719, "train_tokens_per_second": 13677.447 }, { "epoch": 0.2936164634945602, "grad_norm": 0.17578125, "learning_rate": 4.143013544314581e-05, "loss": 1.5514, "num_input_tokens_seen": 293601280, "step": 4480, "train_runtime": 21442.7777, "train_tokens_per_second": 13692.316 }, { "epoch": 0.29427185738628914, "grad_norm": 0.2001953125, "learning_rate": 4.139010362181634e-05, "loss": 1.5679, "num_input_tokens_seen": 294256640, "step": 4490, "train_runtime": 21467.2885, "train_tokens_per_second": 13707.21 }, { "epoch": 0.2949272512780181, "grad_norm": 0.1796875, "learning_rate": 4.134999795921571e-05, "loss": 1.5364, "num_input_tokens_seen": 294912000, "step": 4500, "train_runtime": 21491.7837, "train_tokens_per_second": 13722.081 }, { "epoch": 0.295582645169747, "grad_norm": 0.1904296875, "learning_rate": 4.1309818636029354e-05, "loss": 1.5563, "num_input_tokens_seen": 295567360, "step": 4510, "train_runtime": 21516.2773, "train_tokens_per_second": 13736.919 }, { "epoch": 0.29623803906147594, "grad_norm": 0.21875, "learning_rate": 4.1269565833274584e-05, "loss": 1.5412, "num_input_tokens_seen": 296222720, "step": 4520, "train_runtime": 21540.7693, "train_tokens_per_second": 13751.724 }, { "epoch": 0.29689343295320486, "grad_norm": 0.1943359375, "learning_rate": 4.1229239732299734e-05, "loss": 1.5567, "num_input_tokens_seen": 296878080, "step": 4530, "train_runtime": 21565.2626, "train_tokens_per_second": 13766.495 }, { "epoch": 0.2975488268449338, "grad_norm": 0.2119140625, "learning_rate": 4.118884051478337e-05, "loss": 1.5434, "num_input_tokens_seen": 297533440, "step": 4540, "train_runtime": 21589.8602, "train_tokens_per_second": 13781.166 }, { "epoch": 0.29820422073666275, "grad_norm": 0.19921875, "learning_rate": 4.114836836273348e-05, "loss": 1.5538, "num_input_tokens_seen": 298188800, "step": 4550, "train_runtime": 21614.3465, "train_tokens_per_second": 13795.874 }, { "epoch": 0.29885961462839167, "grad_norm": 0.1923828125, "learning_rate": 4.1107823458486604e-05, "loss": 1.5412, "num_input_tokens_seen": 298844160, "step": 4560, "train_runtime": 21638.8458, "train_tokens_per_second": 13810.541 }, { "epoch": 0.2995150085201206, "grad_norm": 0.17578125, "learning_rate": 4.106720598470709e-05, "loss": 1.5567, "num_input_tokens_seen": 299499520, "step": 4570, "train_runtime": 21663.3322, "train_tokens_per_second": 13825.182 }, { "epoch": 0.3001704024118495, "grad_norm": 0.193359375, "learning_rate": 4.1026516124386193e-05, "loss": 1.5484, "num_input_tokens_seen": 300154880, "step": 4580, "train_runtime": 21695.9129, "train_tokens_per_second": 13834.628 }, { "epoch": 0.3008257963035785, "grad_norm": 0.1904296875, "learning_rate": 4.098575406084133e-05, "loss": 1.592, "num_input_tokens_seen": 300810240, "step": 4590, "train_runtime": 21720.4887, "train_tokens_per_second": 13849.147 }, { "epoch": 0.3014811901953074, "grad_norm": 0.1962890625, "learning_rate": 4.0944919977715156e-05, "loss": 1.565, "num_input_tokens_seen": 301465600, "step": 4600, "train_runtime": 21744.9869, "train_tokens_per_second": 13863.683 }, { "epoch": 0.3021365840870363, "grad_norm": 0.1796875, "learning_rate": 4.090401405897483e-05, "loss": 1.5518, "num_input_tokens_seen": 302120960, "step": 4610, "train_runtime": 21769.4918, "train_tokens_per_second": 13878.182 }, { "epoch": 0.3027919779787652, "grad_norm": 0.1962890625, "learning_rate": 4.086303648891113e-05, "loss": 1.5402, "num_input_tokens_seen": 302776320, "step": 4620, "train_runtime": 21793.9815, "train_tokens_per_second": 13892.657 }, { "epoch": 0.30344737187049414, "grad_norm": 0.1943359375, "learning_rate": 4.082198745213764e-05, "loss": 1.5503, "num_input_tokens_seen": 303431680, "step": 4630, "train_runtime": 21818.4811, "train_tokens_per_second": 13907.095 }, { "epoch": 0.3041027657622231, "grad_norm": 0.1962890625, "learning_rate": 4.078086713358994e-05, "loss": 1.5519, "num_input_tokens_seen": 304087040, "step": 4640, "train_runtime": 21842.9785, "train_tokens_per_second": 13921.501 }, { "epoch": 0.30475815965395203, "grad_norm": 0.1787109375, "learning_rate": 4.073967571852473e-05, "loss": 1.5435, "num_input_tokens_seen": 304742400, "step": 4650, "train_runtime": 21867.6122, "train_tokens_per_second": 13935.788 }, { "epoch": 0.30541355354568095, "grad_norm": 0.2109375, "learning_rate": 4.069841339251902e-05, "loss": 1.5581, "num_input_tokens_seen": 305397760, "step": 4660, "train_runtime": 21892.1136, "train_tokens_per_second": 13950.127 }, { "epoch": 0.30606894743740987, "grad_norm": 0.1748046875, "learning_rate": 4.065708034146929e-05, "loss": 1.56, "num_input_tokens_seen": 306053120, "step": 4670, "train_runtime": 21916.606, "train_tokens_per_second": 13964.44 }, { "epoch": 0.30672434132913884, "grad_norm": 0.2216796875, "learning_rate": 4.061567675159065e-05, "loss": 1.5216, "num_input_tokens_seen": 306708480, "step": 4680, "train_runtime": 21941.1336, "train_tokens_per_second": 13978.698 }, { "epoch": 0.30737973522086776, "grad_norm": 0.17578125, "learning_rate": 4.057420280941602e-05, "loss": 1.5491, "num_input_tokens_seen": 307363840, "step": 4690, "train_runtime": 21965.637, "train_tokens_per_second": 13992.94 }, { "epoch": 0.3080351291125967, "grad_norm": 0.181640625, "learning_rate": 4.053265870179525e-05, "loss": 1.5519, "num_input_tokens_seen": 308019200, "step": 4700, "train_runtime": 21990.141, "train_tokens_per_second": 14007.15 }, { "epoch": 0.3086905230043256, "grad_norm": 0.1826171875, "learning_rate": 4.049104461589432e-05, "loss": 1.579, "num_input_tokens_seen": 308674560, "step": 4710, "train_runtime": 22014.7763, "train_tokens_per_second": 14021.244 }, { "epoch": 0.3093459168960545, "grad_norm": 0.1767578125, "learning_rate": 4.044936073919449e-05, "loss": 1.5486, "num_input_tokens_seen": 309329920, "step": 4720, "train_runtime": 22039.263, "train_tokens_per_second": 14035.402 }, { "epoch": 0.3100013107877835, "grad_norm": 0.1923828125, "learning_rate": 4.040760725949141e-05, "loss": 1.5421, "num_input_tokens_seen": 309985280, "step": 4730, "train_runtime": 22063.7567, "train_tokens_per_second": 14049.524 }, { "epoch": 0.3106567046795124, "grad_norm": 0.19921875, "learning_rate": 4.036578436489432e-05, "loss": 1.5546, "num_input_tokens_seen": 310640640, "step": 4740, "train_runtime": 22088.2535, "train_tokens_per_second": 14063.613 }, { "epoch": 0.3113120985712413, "grad_norm": 0.1708984375, "learning_rate": 4.0323892243825203e-05, "loss": 1.5749, "num_input_tokens_seen": 311296000, "step": 4750, "train_runtime": 22112.7388, "train_tokens_per_second": 14077.677 }, { "epoch": 0.31196749246297023, "grad_norm": 0.19140625, "learning_rate": 4.028193108501791e-05, "loss": 1.5675, "num_input_tokens_seen": 311951360, "step": 4760, "train_runtime": 22137.23, "train_tokens_per_second": 14091.707 }, { "epoch": 0.31262288635469915, "grad_norm": 0.1796875, "learning_rate": 4.023990107751734e-05, "loss": 1.5328, "num_input_tokens_seen": 312606720, "step": 4770, "train_runtime": 22161.8288, "train_tokens_per_second": 14105.637 }, { "epoch": 0.3132782802464281, "grad_norm": 0.19140625, "learning_rate": 4.019780241067856e-05, "loss": 1.5463, "num_input_tokens_seen": 313262080, "step": 4780, "train_runtime": 22186.3174, "train_tokens_per_second": 14119.607 }, { "epoch": 0.31393367413815704, "grad_norm": 0.1650390625, "learning_rate": 4.015563527416595e-05, "loss": 1.5584, "num_input_tokens_seen": 313917440, "step": 4790, "train_runtime": 22210.8112, "train_tokens_per_second": 14133.542 }, { "epoch": 0.31458906802988595, "grad_norm": 0.1796875, "learning_rate": 4.011339985795239e-05, "loss": 1.5453, "num_input_tokens_seen": 314572800, "step": 4800, "train_runtime": 22235.2946, "train_tokens_per_second": 14147.454 }, { "epoch": 0.31524446192161487, "grad_norm": 0.197265625, "learning_rate": 4.007109635231836e-05, "loss": 1.5488, "num_input_tokens_seen": 315228160, "step": 4810, "train_runtime": 22259.7862, "train_tokens_per_second": 14161.329 }, { "epoch": 0.31589985581334384, "grad_norm": 0.2099609375, "learning_rate": 4.0028724947851095e-05, "loss": 1.5404, "num_input_tokens_seen": 315883520, "step": 4820, "train_runtime": 22284.4276, "train_tokens_per_second": 14175.079 }, { "epoch": 0.31655524970507276, "grad_norm": 0.173828125, "learning_rate": 3.998628583544374e-05, "loss": 1.5639, "num_input_tokens_seen": 316538880, "step": 4830, "train_runtime": 22308.9267, "train_tokens_per_second": 14188.889 }, { "epoch": 0.3172106435968017, "grad_norm": 0.2333984375, "learning_rate": 3.994377920629448e-05, "loss": 1.5287, "num_input_tokens_seen": 317194240, "step": 4840, "train_runtime": 22333.4326, "train_tokens_per_second": 14202.664 }, { "epoch": 0.3178660374885306, "grad_norm": 0.205078125, "learning_rate": 3.990120525190567e-05, "loss": 1.5551, "num_input_tokens_seen": 317849600, "step": 4850, "train_runtime": 22357.922, "train_tokens_per_second": 14216.42 }, { "epoch": 0.3185214313802595, "grad_norm": 0.1806640625, "learning_rate": 3.985856416408299e-05, "loss": 1.5592, "num_input_tokens_seen": 318504960, "step": 4860, "train_runtime": 22388.0012, "train_tokens_per_second": 14226.592 }, { "epoch": 0.3191768252719885, "grad_norm": 0.1943359375, "learning_rate": 3.9815856134934557e-05, "loss": 1.5407, "num_input_tokens_seen": 319160320, "step": 4870, "train_runtime": 22412.5892, "train_tokens_per_second": 14240.225 }, { "epoch": 0.3198322191637174, "grad_norm": 0.1884765625, "learning_rate": 3.97730813568701e-05, "loss": 1.5569, "num_input_tokens_seen": 319815680, "step": 4880, "train_runtime": 22437.2339, "train_tokens_per_second": 14253.793 }, { "epoch": 0.3204876130554463, "grad_norm": 0.2138671875, "learning_rate": 3.973024002260005e-05, "loss": 1.5603, "num_input_tokens_seen": 320471040, "step": 4890, "train_runtime": 22461.7622, "train_tokens_per_second": 14267.404 }, { "epoch": 0.32114300694717524, "grad_norm": 0.203125, "learning_rate": 3.96873323251347e-05, "loss": 1.5323, "num_input_tokens_seen": 321126400, "step": 4900, "train_runtime": 22486.305, "train_tokens_per_second": 14280.977 }, { "epoch": 0.3217984008389042, "grad_norm": 0.171875, "learning_rate": 3.9644358457783294e-05, "loss": 1.5341, "num_input_tokens_seen": 321781760, "step": 4910, "train_runtime": 22510.9661, "train_tokens_per_second": 14294.445 }, { "epoch": 0.3224537947306331, "grad_norm": 0.1689453125, "learning_rate": 3.9601318614153225e-05, "loss": 1.551, "num_input_tokens_seen": 322437120, "step": 4920, "train_runtime": 22535.4844, "train_tokens_per_second": 14307.974 }, { "epoch": 0.32310918862236204, "grad_norm": 0.1845703125, "learning_rate": 3.955821298814912e-05, "loss": 1.5322, "num_input_tokens_seen": 323092480, "step": 4930, "train_runtime": 22560.0067, "train_tokens_per_second": 14321.471 }, { "epoch": 0.32376458251409096, "grad_norm": 0.1904296875, "learning_rate": 3.951504177397195e-05, "loss": 1.5388, "num_input_tokens_seen": 323747840, "step": 4940, "train_runtime": 22584.6632, "train_tokens_per_second": 14334.854 }, { "epoch": 0.3244199764058199, "grad_norm": 0.208984375, "learning_rate": 3.947180516611817e-05, "loss": 1.5529, "num_input_tokens_seen": 324403200, "step": 4950, "train_runtime": 22609.2157, "train_tokens_per_second": 14348.273 }, { "epoch": 0.32507537029754885, "grad_norm": 0.2236328125, "learning_rate": 3.94285033593789e-05, "loss": 1.5805, "num_input_tokens_seen": 325058560, "step": 4960, "train_runtime": 22633.721, "train_tokens_per_second": 14361.693 }, { "epoch": 0.32573076418927777, "grad_norm": 0.1826171875, "learning_rate": 3.9385136548838944e-05, "loss": 1.554, "num_input_tokens_seen": 325713920, "step": 4970, "train_runtime": 22658.2369, "train_tokens_per_second": 14375.078 }, { "epoch": 0.3263861580810067, "grad_norm": 0.224609375, "learning_rate": 3.9341704929875975e-05, "loss": 1.548, "num_input_tokens_seen": 326369280, "step": 4980, "train_runtime": 22682.7524, "train_tokens_per_second": 14388.434 }, { "epoch": 0.3270415519727356, "grad_norm": 0.2099609375, "learning_rate": 3.9298208698159674e-05, "loss": 1.5444, "num_input_tokens_seen": 327024640, "step": 4990, "train_runtime": 22707.3997, "train_tokens_per_second": 14401.677 }, { "epoch": 0.3276969458644645, "grad_norm": 0.173828125, "learning_rate": 3.925464804965077e-05, "loss": 1.5463, "num_input_tokens_seen": 327680000, "step": 5000, "train_runtime": 22731.908, "train_tokens_per_second": 14414.98 }, { "epoch": 0.3283523397561935, "grad_norm": 0.197265625, "learning_rate": 3.921102318060023e-05, "loss": 1.5152, "num_input_tokens_seen": 328335360, "step": 5010, "train_runtime": 22756.3904, "train_tokens_per_second": 14428.271 }, { "epoch": 0.3290077336479224, "grad_norm": 0.224609375, "learning_rate": 3.916733428754836e-05, "loss": 1.5177, "num_input_tokens_seen": 328990720, "step": 5020, "train_runtime": 22780.8705, "train_tokens_per_second": 14441.534 }, { "epoch": 0.3296631275396513, "grad_norm": 0.2138671875, "learning_rate": 3.912358156732389e-05, "loss": 1.5292, "num_input_tokens_seen": 329646080, "step": 5030, "train_runtime": 22805.3817, "train_tokens_per_second": 14454.75 }, { "epoch": 0.33031852143138024, "grad_norm": 0.212890625, "learning_rate": 3.9079765217043115e-05, "loss": 1.5365, "num_input_tokens_seen": 330301440, "step": 5040, "train_runtime": 22829.8822, "train_tokens_per_second": 14467.943 }, { "epoch": 0.3309739153231092, "grad_norm": 0.185546875, "learning_rate": 3.903588543410901e-05, "loss": 1.5301, "num_input_tokens_seen": 330956800, "step": 5050, "train_runtime": 22854.5149, "train_tokens_per_second": 14481.025 }, { "epoch": 0.33162930921483813, "grad_norm": 0.1923828125, "learning_rate": 3.899194241621029e-05, "loss": 1.5334, "num_input_tokens_seen": 331612160, "step": 5060, "train_runtime": 22879.0465, "train_tokens_per_second": 14494.142 }, { "epoch": 0.33228470310656705, "grad_norm": 0.2099609375, "learning_rate": 3.8947936361320605e-05, "loss": 1.5499, "num_input_tokens_seen": 332267520, "step": 5070, "train_runtime": 22903.5252, "train_tokens_per_second": 14507.265 }, { "epoch": 0.33294009699829596, "grad_norm": 0.2021484375, "learning_rate": 3.8903867467697565e-05, "loss": 1.5231, "num_input_tokens_seen": 332922880, "step": 5080, "train_runtime": 22928.0221, "train_tokens_per_second": 14520.349 }, { "epoch": 0.3335954908900249, "grad_norm": 0.189453125, "learning_rate": 3.885973593388191e-05, "loss": 1.5454, "num_input_tokens_seen": 333578240, "step": 5090, "train_runtime": 22952.4755, "train_tokens_per_second": 14533.432 }, { "epoch": 0.33425088478175385, "grad_norm": 0.2294921875, "learning_rate": 3.881554195869658e-05, "loss": 1.5403, "num_input_tokens_seen": 334233600, "step": 5100, "train_runtime": 22977.0577, "train_tokens_per_second": 14546.406 }, { "epoch": 0.33490627867348277, "grad_norm": 0.1865234375, "learning_rate": 3.877128574124583e-05, "loss": 1.5524, "num_input_tokens_seen": 334888960, "step": 5110, "train_runtime": 23001.5118, "train_tokens_per_second": 14559.433 }, { "epoch": 0.3355616725652117, "grad_norm": 0.205078125, "learning_rate": 3.8726967480914306e-05, "loss": 1.5433, "num_input_tokens_seen": 335544320, "step": 5120, "train_runtime": 23025.9988, "train_tokens_per_second": 14572.411 }, { "epoch": 0.3362170664569406, "grad_norm": 0.2294921875, "learning_rate": 3.868258737736621e-05, "loss": 1.5493, "num_input_tokens_seen": 336199680, "step": 5130, "train_runtime": 23050.4996, "train_tokens_per_second": 14585.353 }, { "epoch": 0.3368724603486696, "grad_norm": 0.1826171875, "learning_rate": 3.8638145630544345e-05, "loss": 1.5557, "num_input_tokens_seen": 336855040, "step": 5140, "train_runtime": 23074.9533, "train_tokens_per_second": 14598.298 }, { "epoch": 0.3375278542403985, "grad_norm": 0.2080078125, "learning_rate": 3.859364244066922e-05, "loss": 1.5375, "num_input_tokens_seen": 337510400, "step": 5150, "train_runtime": 23099.4188, "train_tokens_per_second": 14611.207 }, { "epoch": 0.3381832481321274, "grad_norm": 0.2119140625, "learning_rate": 3.854907800823818e-05, "loss": 1.5556, "num_input_tokens_seen": 338165760, "step": 5160, "train_runtime": 23123.9614, "train_tokens_per_second": 14624.041 }, { "epoch": 0.33883864202385633, "grad_norm": 0.244140625, "learning_rate": 3.850445253402447e-05, "loss": 1.5256, "num_input_tokens_seen": 338821120, "step": 5170, "train_runtime": 23148.4136, "train_tokens_per_second": 14636.905 }, { "epoch": 0.33949403591558525, "grad_norm": 0.1845703125, "learning_rate": 3.845976621907635e-05, "loss": 1.5537, "num_input_tokens_seen": 339476480, "step": 5180, "train_runtime": 23172.8858, "train_tokens_per_second": 14649.728 }, { "epoch": 0.3401494298073142, "grad_norm": 0.17578125, "learning_rate": 3.8415019264716176e-05, "loss": 1.5427, "num_input_tokens_seen": 340131840, "step": 5190, "train_runtime": 23197.3426, "train_tokens_per_second": 14662.535 }, { "epoch": 0.34080482369904314, "grad_norm": 0.2314453125, "learning_rate": 3.83702118725395e-05, "loss": 1.5483, "num_input_tokens_seen": 340787200, "step": 5200, "train_runtime": 23221.8067, "train_tokens_per_second": 14675.31 }, { "epoch": 0.34146021759077205, "grad_norm": 0.1787109375, "learning_rate": 3.8325344244414146e-05, "loss": 1.5722, "num_input_tokens_seen": 341442560, "step": 5210, "train_runtime": 23246.2455, "train_tokens_per_second": 14688.073 }, { "epoch": 0.34211561148250097, "grad_norm": 0.2021484375, "learning_rate": 3.828041658247935e-05, "loss": 1.5532, "num_input_tokens_seen": 342097920, "step": 5220, "train_runtime": 23270.7997, "train_tokens_per_second": 14700.738 }, { "epoch": 0.3427710053742299, "grad_norm": 0.1904296875, "learning_rate": 3.823542908914479e-05, "loss": 1.5265, "num_input_tokens_seen": 342753280, "step": 5230, "train_runtime": 23295.2483, "train_tokens_per_second": 14713.442 }, { "epoch": 0.34342639926595886, "grad_norm": 0.228515625, "learning_rate": 3.81903819670897e-05, "loss": 1.5395, "num_input_tokens_seen": 343408640, "step": 5240, "train_runtime": 23319.7002, "train_tokens_per_second": 14726.117 }, { "epoch": 0.3440817931576878, "grad_norm": 0.212890625, "learning_rate": 3.814527541926196e-05, "loss": 1.5636, "num_input_tokens_seen": 344064000, "step": 5250, "train_runtime": 23344.1573, "train_tokens_per_second": 14738.763 }, { "epoch": 0.3447371870494167, "grad_norm": 0.203125, "learning_rate": 3.810010964887717e-05, "loss": 1.5675, "num_input_tokens_seen": 344719360, "step": 5260, "train_runtime": 23368.6057, "train_tokens_per_second": 14751.388 }, { "epoch": 0.3453925809411456, "grad_norm": 0.181640625, "learning_rate": 3.805488485941775e-05, "loss": 1.5557, "num_input_tokens_seen": 345374720, "step": 5270, "train_runtime": 23393.0427, "train_tokens_per_second": 14763.993 }, { "epoch": 0.3460479748328746, "grad_norm": 0.1884765625, "learning_rate": 3.8009601254632014e-05, "loss": 1.5353, "num_input_tokens_seen": 346030080, "step": 5280, "train_runtime": 23417.6426, "train_tokens_per_second": 14776.469 }, { "epoch": 0.3467033687246035, "grad_norm": 0.197265625, "learning_rate": 3.7964259038533256e-05, "loss": 1.5275, "num_input_tokens_seen": 346685440, "step": 5290, "train_runtime": 23442.0906, "train_tokens_per_second": 14789.015 }, { "epoch": 0.3473587626163324, "grad_norm": 0.2001953125, "learning_rate": 3.791885841539881e-05, "loss": 1.5355, "num_input_tokens_seen": 347340800, "step": 5300, "train_runtime": 23466.5745, "train_tokens_per_second": 14801.513 }, { "epoch": 0.34801415650806133, "grad_norm": 0.2119140625, "learning_rate": 3.7873399589769166e-05, "loss": 1.5463, "num_input_tokens_seen": 347996160, "step": 5310, "train_runtime": 23491.045, "train_tokens_per_second": 14813.992 }, { "epoch": 0.34866955039979025, "grad_norm": 0.1923828125, "learning_rate": 3.782788276644702e-05, "loss": 1.5377, "num_input_tokens_seen": 348651520, "step": 5320, "train_runtime": 23515.5038, "train_tokens_per_second": 14826.453 }, { "epoch": 0.3493249442915192, "grad_norm": 0.240234375, "learning_rate": 3.778230815049637e-05, "loss": 1.5291, "num_input_tokens_seen": 349306880, "step": 5330, "train_runtime": 23539.9696, "train_tokens_per_second": 14838.884 }, { "epoch": 0.34998033818324814, "grad_norm": 0.158203125, "learning_rate": 3.773667594724157e-05, "loss": 1.5339, "num_input_tokens_seen": 349962240, "step": 5340, "train_runtime": 23564.5815, "train_tokens_per_second": 14851.197 }, { "epoch": 0.35063573207497706, "grad_norm": 0.216796875, "learning_rate": 3.769098636226645e-05, "loss": 1.5417, "num_input_tokens_seen": 350617600, "step": 5350, "train_runtime": 23589.0683, "train_tokens_per_second": 14863.563 }, { "epoch": 0.351291125966706, "grad_norm": 0.1787109375, "learning_rate": 3.76452396014133e-05, "loss": 1.5456, "num_input_tokens_seen": 351272960, "step": 5360, "train_runtime": 23613.512, "train_tokens_per_second": 14875.93 }, { "epoch": 0.3519465198584349, "grad_norm": 0.1787109375, "learning_rate": 3.7599435870782064e-05, "loss": 1.5567, "num_input_tokens_seen": 351928320, "step": 5370, "train_runtime": 23637.9585, "train_tokens_per_second": 14888.27 }, { "epoch": 0.35260191375016386, "grad_norm": 0.2138671875, "learning_rate": 3.75535753767293e-05, "loss": 1.5691, "num_input_tokens_seen": 352583680, "step": 5380, "train_runtime": 23662.4028, "train_tokens_per_second": 14900.587 }, { "epoch": 0.3532573076418928, "grad_norm": 0.203125, "learning_rate": 3.7507658325867336e-05, "loss": 1.5368, "num_input_tokens_seen": 353239040, "step": 5390, "train_runtime": 23686.9968, "train_tokens_per_second": 14912.783 }, { "epoch": 0.3539127015336217, "grad_norm": 0.1953125, "learning_rate": 3.7461684925063266e-05, "loss": 1.5444, "num_input_tokens_seen": 353894400, "step": 5400, "train_runtime": 23711.4349, "train_tokens_per_second": 14925.052 }, { "epoch": 0.3545680954253506, "grad_norm": 0.1796875, "learning_rate": 3.7415655381438084e-05, "loss": 1.5431, "num_input_tokens_seen": 354549760, "step": 5410, "train_runtime": 23735.8784, "train_tokens_per_second": 14937.293 }, { "epoch": 0.3552234893170796, "grad_norm": 0.1943359375, "learning_rate": 3.736956990236571e-05, "loss": 1.5509, "num_input_tokens_seen": 355205120, "step": 5420, "train_runtime": 23760.3206, "train_tokens_per_second": 14949.509 }, { "epoch": 0.3558788832088085, "grad_norm": 0.1669921875, "learning_rate": 3.732342869547206e-05, "loss": 1.544, "num_input_tokens_seen": 355860480, "step": 5430, "train_runtime": 23784.7949, "train_tokens_per_second": 14961.68 }, { "epoch": 0.3565342771005374, "grad_norm": 0.1806640625, "learning_rate": 3.727723196863413e-05, "loss": 1.5396, "num_input_tokens_seen": 356515840, "step": 5440, "train_runtime": 23809.2598, "train_tokens_per_second": 14973.831 }, { "epoch": 0.35718967099226634, "grad_norm": 0.1728515625, "learning_rate": 3.723097992997902e-05, "loss": 1.5446, "num_input_tokens_seen": 357171200, "step": 5450, "train_runtime": 23833.8488, "train_tokens_per_second": 14985.88 }, { "epoch": 0.35784506488399526, "grad_norm": 0.203125, "learning_rate": 3.718467278788306e-05, "loss": 1.5532, "num_input_tokens_seen": 357826560, "step": 5460, "train_runtime": 23858.3208, "train_tokens_per_second": 14997.978 }, { "epoch": 0.35850045877572423, "grad_norm": 0.2060546875, "learning_rate": 3.713831075097079e-05, "loss": 1.5611, "num_input_tokens_seen": 358481920, "step": 5470, "train_runtime": 23882.7742, "train_tokens_per_second": 15010.062 }, { "epoch": 0.35915585266745315, "grad_norm": 0.1875, "learning_rate": 3.7091894028114114e-05, "loss": 1.5237, "num_input_tokens_seen": 359137280, "step": 5480, "train_runtime": 23907.2277, "train_tokens_per_second": 15022.122 }, { "epoch": 0.35981124655918206, "grad_norm": 0.1806640625, "learning_rate": 3.704542282843128e-05, "loss": 1.55, "num_input_tokens_seen": 359792640, "step": 5490, "train_runtime": 23931.6789, "train_tokens_per_second": 15034.158 }, { "epoch": 0.360466640450911, "grad_norm": 0.205078125, "learning_rate": 3.699889736128597e-05, "loss": 1.5398, "num_input_tokens_seen": 360448000, "step": 5500, "train_runtime": 23956.278, "train_tokens_per_second": 15046.077 }, { "epoch": 0.36112203434263995, "grad_norm": 0.1884765625, "learning_rate": 3.6952317836286344e-05, "loss": 1.5524, "num_input_tokens_seen": 361103360, "step": 5510, "train_runtime": 23980.744, "train_tokens_per_second": 15058.055 }, { "epoch": 0.36177742823436887, "grad_norm": 0.1875, "learning_rate": 3.6905684463284126e-05, "loss": 1.5454, "num_input_tokens_seen": 361758720, "step": 5520, "train_runtime": 24005.2156, "train_tokens_per_second": 15070.005 }, { "epoch": 0.3624328221260978, "grad_norm": 0.1796875, "learning_rate": 3.6858997452373625e-05, "loss": 1.5432, "num_input_tokens_seen": 362414080, "step": 5530, "train_runtime": 24030.8851, "train_tokens_per_second": 15081.179 }, { "epoch": 0.3630882160178267, "grad_norm": 0.1787109375, "learning_rate": 3.6812257013890794e-05, "loss": 1.5585, "num_input_tokens_seen": 363069440, "step": 5540, "train_runtime": 24055.3429, "train_tokens_per_second": 15093.089 }, { "epoch": 0.3637436099095556, "grad_norm": 0.203125, "learning_rate": 3.676546335841232e-05, "loss": 1.5533, "num_input_tokens_seen": 363724800, "step": 5550, "train_runtime": 24079.7929, "train_tokens_per_second": 15104.98 }, { "epoch": 0.3643990038012846, "grad_norm": 0.2060546875, "learning_rate": 3.671861669675461e-05, "loss": 1.5429, "num_input_tokens_seen": 364380160, "step": 5560, "train_runtime": 24104.367, "train_tokens_per_second": 15116.769 }, { "epoch": 0.3650543976930135, "grad_norm": 0.1845703125, "learning_rate": 3.6671717239972894e-05, "loss": 1.5421, "num_input_tokens_seen": 365035520, "step": 5570, "train_runtime": 24128.8313, "train_tokens_per_second": 15128.603 }, { "epoch": 0.3657097915847424, "grad_norm": 0.1728515625, "learning_rate": 3.662476519936026e-05, "loss": 1.5226, "num_input_tokens_seen": 365690880, "step": 5580, "train_runtime": 24153.2848, "train_tokens_per_second": 15140.42 }, { "epoch": 0.36636518547647134, "grad_norm": 0.1962890625, "learning_rate": 3.6577760786446684e-05, "loss": 1.5224, "num_input_tokens_seen": 366346240, "step": 5590, "train_runtime": 24177.7355, "train_tokens_per_second": 15152.215 }, { "epoch": 0.36702057936820026, "grad_norm": 0.1953125, "learning_rate": 3.6530704212998114e-05, "loss": 1.5252, "num_input_tokens_seen": 367001600, "step": 5600, "train_runtime": 24202.187, "train_tokens_per_second": 15163.985 }, { "epoch": 0.36767597325992923, "grad_norm": 0.220703125, "learning_rate": 3.648359569101546e-05, "loss": 1.5558, "num_input_tokens_seen": 367656960, "step": 5610, "train_runtime": 24226.6208, "train_tokens_per_second": 15175.743 }, { "epoch": 0.36833136715165815, "grad_norm": 0.259765625, "learning_rate": 3.643643543273371e-05, "loss": 1.5485, "num_input_tokens_seen": 368312320, "step": 5620, "train_runtime": 24251.1622, "train_tokens_per_second": 15187.409 }, { "epoch": 0.36898676104338707, "grad_norm": 0.1669921875, "learning_rate": 3.63892236506209e-05, "loss": 1.5602, "num_input_tokens_seen": 368967680, "step": 5630, "train_runtime": 24275.5891, "train_tokens_per_second": 15199.124 }, { "epoch": 0.369642154935116, "grad_norm": 0.1865234375, "learning_rate": 3.6341960557377226e-05, "loss": 1.5492, "num_input_tokens_seen": 369623040, "step": 5640, "train_runtime": 24300.0262, "train_tokens_per_second": 15210.808 }, { "epoch": 0.37029754882684496, "grad_norm": 0.166015625, "learning_rate": 3.6294646365934024e-05, "loss": 1.5379, "num_input_tokens_seen": 370278400, "step": 5650, "train_runtime": 24324.4612, "train_tokens_per_second": 15222.471 }, { "epoch": 0.3709529427185739, "grad_norm": 0.2314453125, "learning_rate": 3.624728128945287e-05, "loss": 1.5398, "num_input_tokens_seen": 370933760, "step": 5660, "train_runtime": 24348.9077, "train_tokens_per_second": 15234.103 }, { "epoch": 0.3716083366103028, "grad_norm": 0.1943359375, "learning_rate": 3.619986554132456e-05, "loss": 1.5396, "num_input_tokens_seen": 371589120, "step": 5670, "train_runtime": 24378.9369, "train_tokens_per_second": 15242.22 }, { "epoch": 0.3722637305020317, "grad_norm": 0.193359375, "learning_rate": 3.615239933516819e-05, "loss": 1.5346, "num_input_tokens_seen": 372244480, "step": 5680, "train_runtime": 24403.4235, "train_tokens_per_second": 15253.781 }, { "epoch": 0.3729191243937606, "grad_norm": 0.224609375, "learning_rate": 3.6104882884830184e-05, "loss": 1.5439, "num_input_tokens_seen": 372899840, "step": 5690, "train_runtime": 24427.8675, "train_tokens_per_second": 15265.346 }, { "epoch": 0.3735745182854896, "grad_norm": 0.1669921875, "learning_rate": 3.605731640438332e-05, "loss": 1.5567, "num_input_tokens_seen": 373555200, "step": 5700, "train_runtime": 24452.3368, "train_tokens_per_second": 15276.871 }, { "epoch": 0.3742299121772185, "grad_norm": 0.23046875, "learning_rate": 3.600970010812579e-05, "loss": 1.5217, "num_input_tokens_seen": 374210560, "step": 5710, "train_runtime": 24476.7953, "train_tokens_per_second": 15288.38 }, { "epoch": 0.37488530606894743, "grad_norm": 0.1865234375, "learning_rate": 3.5962034210580196e-05, "loss": 1.5482, "num_input_tokens_seen": 374865920, "step": 5720, "train_runtime": 24501.2437, "train_tokens_per_second": 15299.873 }, { "epoch": 0.37554069996067635, "grad_norm": 0.189453125, "learning_rate": 3.591431892649262e-05, "loss": 1.5422, "num_input_tokens_seen": 375521280, "step": 5730, "train_runtime": 24525.8758, "train_tokens_per_second": 15311.228 }, { "epoch": 0.3761960938524053, "grad_norm": 0.203125, "learning_rate": 3.586655447083164e-05, "loss": 1.5392, "num_input_tokens_seen": 376176640, "step": 5740, "train_runtime": 24550.3222, "train_tokens_per_second": 15322.676 }, { "epoch": 0.37685148774413424, "grad_norm": 0.255859375, "learning_rate": 3.581874105878735e-05, "loss": 1.5496, "num_input_tokens_seen": 376832000, "step": 5750, "train_runtime": 24574.7832, "train_tokens_per_second": 15334.093 }, { "epoch": 0.37750688163586316, "grad_norm": 0.1787109375, "learning_rate": 3.577087890577042e-05, "loss": 1.5356, "num_input_tokens_seen": 377487360, "step": 5760, "train_runtime": 24599.2322, "train_tokens_per_second": 15345.494 }, { "epoch": 0.3781622755275921, "grad_norm": 0.1875, "learning_rate": 3.572296822741112e-05, "loss": 1.5581, "num_input_tokens_seen": 378142720, "step": 5770, "train_runtime": 24623.6925, "train_tokens_per_second": 15356.865 }, { "epoch": 0.378817669419321, "grad_norm": 0.21875, "learning_rate": 3.5675009239558296e-05, "loss": 1.5576, "num_input_tokens_seen": 378798080, "step": 5780, "train_runtime": 24648.2954, "train_tokens_per_second": 15368.125 }, { "epoch": 0.37947306331104996, "grad_norm": 0.1728515625, "learning_rate": 3.562700215827849e-05, "loss": 1.5537, "num_input_tokens_seen": 379453440, "step": 5790, "train_runtime": 24672.7398, "train_tokens_per_second": 15379.461 }, { "epoch": 0.3801284572027789, "grad_norm": 0.2119140625, "learning_rate": 3.557894719985488e-05, "loss": 1.5458, "num_input_tokens_seen": 380108800, "step": 5800, "train_runtime": 24697.2284, "train_tokens_per_second": 15390.747 }, { "epoch": 0.3807838510945078, "grad_norm": 0.21875, "learning_rate": 3.553084458078636e-05, "loss": 1.5302, "num_input_tokens_seen": 380764160, "step": 5810, "train_runtime": 24721.6862, "train_tokens_per_second": 15402.03 }, { "epoch": 0.3814392449862367, "grad_norm": 0.1748046875, "learning_rate": 3.548269451778653e-05, "loss": 1.5616, "num_input_tokens_seen": 381419520, "step": 5820, "train_runtime": 24746.1476, "train_tokens_per_second": 15413.289 }, { "epoch": 0.38209463887796563, "grad_norm": 0.197265625, "learning_rate": 3.5434497227782765e-05, "loss": 1.5434, "num_input_tokens_seen": 382074880, "step": 5830, "train_runtime": 24770.6122, "train_tokens_per_second": 15424.523 }, { "epoch": 0.3827500327696946, "grad_norm": 0.1875, "learning_rate": 3.538625292791519e-05, "loss": 1.5301, "num_input_tokens_seen": 382730240, "step": 5840, "train_runtime": 24795.2065, "train_tokens_per_second": 15435.654 }, { "epoch": 0.3834054266614235, "grad_norm": 0.2099609375, "learning_rate": 3.5337961835535694e-05, "loss": 1.5437, "num_input_tokens_seen": 383385600, "step": 5850, "train_runtime": 24819.6955, "train_tokens_per_second": 15446.829 }, { "epoch": 0.38406082055315244, "grad_norm": 0.185546875, "learning_rate": 3.528962416820703e-05, "loss": 1.5478, "num_input_tokens_seen": 384040960, "step": 5860, "train_runtime": 24844.1645, "train_tokens_per_second": 15457.995 }, { "epoch": 0.38471621444488135, "grad_norm": 0.1982421875, "learning_rate": 3.524124014370175e-05, "loss": 1.5492, "num_input_tokens_seen": 384696320, "step": 5870, "train_runtime": 24868.6316, "train_tokens_per_second": 15469.139 }, { "epoch": 0.3853716083366103, "grad_norm": 0.171875, "learning_rate": 3.519280998000126e-05, "loss": 1.5298, "num_input_tokens_seen": 385351680, "step": 5880, "train_runtime": 24893.1011, "train_tokens_per_second": 15480.26 }, { "epoch": 0.38602700222833924, "grad_norm": 0.1826171875, "learning_rate": 3.5144333895294843e-05, "loss": 1.5389, "num_input_tokens_seen": 386007040, "step": 5890, "train_runtime": 24917.5534, "train_tokens_per_second": 15491.37 }, { "epoch": 0.38668239612006816, "grad_norm": 0.189453125, "learning_rate": 3.509581210797865e-05, "loss": 1.5612, "num_input_tokens_seen": 386662400, "step": 5900, "train_runtime": 24942.1373, "train_tokens_per_second": 15502.376 }, { "epoch": 0.3873377900117971, "grad_norm": 0.1728515625, "learning_rate": 3.504724483665475e-05, "loss": 1.5646, "num_input_tokens_seen": 387317760, "step": 5910, "train_runtime": 24966.5918, "train_tokens_per_second": 15513.441 }, { "epoch": 0.387993183903526, "grad_norm": 0.1904296875, "learning_rate": 3.499863230013012e-05, "loss": 1.5918, "num_input_tokens_seen": 387973120, "step": 5920, "train_runtime": 24991.0301, "train_tokens_per_second": 15524.495 }, { "epoch": 0.38864857779525497, "grad_norm": 0.1767578125, "learning_rate": 3.494997471741568e-05, "loss": 1.5325, "num_input_tokens_seen": 388628480, "step": 5930, "train_runtime": 25015.4992, "train_tokens_per_second": 15535.508 }, { "epoch": 0.3893039716869839, "grad_norm": 0.1875, "learning_rate": 3.490127230772526e-05, "loss": 1.5647, "num_input_tokens_seen": 389283840, "step": 5940, "train_runtime": 25039.947, "train_tokens_per_second": 15546.512 }, { "epoch": 0.3899593655787128, "grad_norm": 0.1630859375, "learning_rate": 3.485252529047468e-05, "loss": 1.5476, "num_input_tokens_seen": 389939200, "step": 5950, "train_runtime": 25064.417, "train_tokens_per_second": 15557.481 }, { "epoch": 0.3906147594704417, "grad_norm": 0.1884765625, "learning_rate": 3.480373388528073e-05, "loss": 1.5434, "num_input_tokens_seen": 390594560, "step": 5960, "train_runtime": 25089.0061, "train_tokens_per_second": 15568.355 }, { "epoch": 0.3912701533621707, "grad_norm": 0.177734375, "learning_rate": 3.475489831196015e-05, "loss": 1.5588, "num_input_tokens_seen": 391249920, "step": 5970, "train_runtime": 25113.4697, "train_tokens_per_second": 15579.286 }, { "epoch": 0.3919255472538996, "grad_norm": 0.1884765625, "learning_rate": 3.470601879052867e-05, "loss": 1.5548, "num_input_tokens_seen": 391905280, "step": 5980, "train_runtime": 25137.9112, "train_tokens_per_second": 15590.209 }, { "epoch": 0.3925809411456285, "grad_norm": 0.19921875, "learning_rate": 3.465709554120005e-05, "loss": 1.5628, "num_input_tokens_seen": 392560640, "step": 5990, "train_runtime": 25162.3683, "train_tokens_per_second": 15601.101 }, { "epoch": 0.39323633503735744, "grad_norm": 0.1787109375, "learning_rate": 3.460812878438503e-05, "loss": 1.5522, "num_input_tokens_seen": 393216000, "step": 6000, "train_runtime": 25186.8163, "train_tokens_per_second": 15611.977 }, { "epoch": 0.39389172892908636, "grad_norm": 0.18359375, "learning_rate": 3.4559118740690355e-05, "loss": 1.5418, "num_input_tokens_seen": 393871360, "step": 6010, "train_runtime": 25211.3912, "train_tokens_per_second": 15622.754 }, { "epoch": 0.39454712282081533, "grad_norm": 0.2109375, "learning_rate": 3.4510065630917794e-05, "loss": 1.5633, "num_input_tokens_seen": 394526720, "step": 6020, "train_runtime": 25235.8282, "train_tokens_per_second": 15633.595 }, { "epoch": 0.39520251671254425, "grad_norm": 0.22265625, "learning_rate": 3.4460969676063165e-05, "loss": 1.5477, "num_input_tokens_seen": 395182080, "step": 6030, "train_runtime": 25260.2953, "train_tokens_per_second": 15644.397 }, { "epoch": 0.39585791060427317, "grad_norm": 0.1923828125, "learning_rate": 3.441183109731527e-05, "loss": 1.5305, "num_input_tokens_seen": 395837440, "step": 6040, "train_runtime": 25284.7523, "train_tokens_per_second": 15655.184 }, { "epoch": 0.3965133044960021, "grad_norm": 0.1943359375, "learning_rate": 3.4362650116054964e-05, "loss": 1.5486, "num_input_tokens_seen": 396492800, "step": 6050, "train_runtime": 25309.2279, "train_tokens_per_second": 15665.938 }, { "epoch": 0.397168698387731, "grad_norm": 0.18359375, "learning_rate": 3.4313426953854136e-05, "loss": 1.5389, "num_input_tokens_seen": 397148160, "step": 6060, "train_runtime": 25333.6783, "train_tokens_per_second": 15676.688 }, { "epoch": 0.39782409227946, "grad_norm": 0.1728515625, "learning_rate": 3.426416183247471e-05, "loss": 1.5689, "num_input_tokens_seen": 397803520, "step": 6070, "train_runtime": 25358.2379, "train_tokens_per_second": 15687.349 }, { "epoch": 0.3984794861711889, "grad_norm": 0.2177734375, "learning_rate": 3.421485497386764e-05, "loss": 1.5553, "num_input_tokens_seen": 398458880, "step": 6080, "train_runtime": 25382.6849, "train_tokens_per_second": 15698.059 }, { "epoch": 0.3991348800629178, "grad_norm": 0.1748046875, "learning_rate": 3.4165506600171923e-05, "loss": 1.5694, "num_input_tokens_seen": 399114240, "step": 6090, "train_runtime": 25407.1227, "train_tokens_per_second": 15708.754 }, { "epoch": 0.3997902739546467, "grad_norm": 0.1943359375, "learning_rate": 3.4116116933713583e-05, "loss": 1.5529, "num_input_tokens_seen": 399769600, "step": 6100, "train_runtime": 25431.5759, "train_tokens_per_second": 15719.419 } ], "logging_steps": 10, "max_steps": 15258, "num_input_tokens_seen": 399769600, "num_train_epochs": 9223372036854775807, "save_steps": 1525, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.793083073069056e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }