|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 6644, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0015051173991571343, |
|
"grad_norm": 1.8453121109105117, |
|
"learning_rate": 1.9972907886815173e-05, |
|
"loss": 1.1924, |
|
"mean_token_accuracy": 0.7275555655360222, |
|
"num_tokens": 655360.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0030102347983142685, |
|
"grad_norm": 1.4854652689978918, |
|
"learning_rate": 1.9942805538832032e-05, |
|
"loss": 1.093, |
|
"mean_token_accuracy": 0.7368938684463501, |
|
"num_tokens": 1310497.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.004515352197471403, |
|
"grad_norm": 1.457367925139778, |
|
"learning_rate": 1.9912703190848888e-05, |
|
"loss": 0.9763, |
|
"mean_token_accuracy": 0.757565951347351, |
|
"num_tokens": 1965857.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.006020469596628537, |
|
"grad_norm": 1.336380809253597, |
|
"learning_rate": 1.9882600842865743e-05, |
|
"loss": 0.9562, |
|
"mean_token_accuracy": 0.7597963467240334, |
|
"num_tokens": 2621217.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.007525586995785672, |
|
"grad_norm": 1.271042190976678, |
|
"learning_rate": 1.9852498494882602e-05, |
|
"loss": 0.9111, |
|
"mean_token_accuracy": 0.7664208248257637, |
|
"num_tokens": 3275522.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.009030704394942806, |
|
"grad_norm": 1.601127132462518, |
|
"learning_rate": 1.982239614689946e-05, |
|
"loss": 0.9061, |
|
"mean_token_accuracy": 0.7658738359808922, |
|
"num_tokens": 3930882.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.01053582179409994, |
|
"grad_norm": 1.2565022237536327, |
|
"learning_rate": 1.9792293798916317e-05, |
|
"loss": 0.9142, |
|
"mean_token_accuracy": 0.7641694605350494, |
|
"num_tokens": 4583016.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.012040939193257074, |
|
"grad_norm": 1.151400416011175, |
|
"learning_rate": 1.9762191450933176e-05, |
|
"loss": 0.8912, |
|
"mean_token_accuracy": 0.7694499135017395, |
|
"num_tokens": 5238022.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.013546056592414209, |
|
"grad_norm": 1.2919672366499955, |
|
"learning_rate": 1.973208910295003e-05, |
|
"loss": 0.8648, |
|
"mean_token_accuracy": 0.7729788482189178, |
|
"num_tokens": 5891895.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.015051173991571343, |
|
"grad_norm": 1.1577649641155117, |
|
"learning_rate": 1.970198675496689e-05, |
|
"loss": 0.8343, |
|
"mean_token_accuracy": 0.7795944586396217, |
|
"num_tokens": 6544042.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.016556291390728478, |
|
"grad_norm": 1.1650145577708801, |
|
"learning_rate": 1.9671884406983746e-05, |
|
"loss": 0.8326, |
|
"mean_token_accuracy": 0.7799147427082062, |
|
"num_tokens": 7197586.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.018061408789885613, |
|
"grad_norm": 1.042071712058231, |
|
"learning_rate": 1.9641782059000605e-05, |
|
"loss": 0.8124, |
|
"mean_token_accuracy": 0.783945745229721, |
|
"num_tokens": 7852585.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.019566526189042744, |
|
"grad_norm": 1.0693315253462947, |
|
"learning_rate": 1.961167971101746e-05, |
|
"loss": 0.8597, |
|
"mean_token_accuracy": 0.7736718013882637, |
|
"num_tokens": 8504289.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.02107164358819988, |
|
"grad_norm": 1.0566333790719504, |
|
"learning_rate": 1.958157736303432e-05, |
|
"loss": 0.8939, |
|
"mean_token_accuracy": 0.7663755238056182, |
|
"num_tokens": 9159625.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.022576760987357013, |
|
"grad_norm": 1.1053130861967317, |
|
"learning_rate": 1.9551475015051175e-05, |
|
"loss": 0.8191, |
|
"mean_token_accuracy": 0.7820270523428917, |
|
"num_tokens": 9814985.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.024081878386514148, |
|
"grad_norm": 1.1367777908835162, |
|
"learning_rate": 1.9521372667068034e-05, |
|
"loss": 0.8075, |
|
"mean_token_accuracy": 0.7839680761098862, |
|
"num_tokens": 10468781.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.025586995785671283, |
|
"grad_norm": 1.1232727757104992, |
|
"learning_rate": 1.949127031908489e-05, |
|
"loss": 0.8044, |
|
"mean_token_accuracy": 0.7850238159298897, |
|
"num_tokens": 11124141.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.027092113184828417, |
|
"grad_norm": 1.0465176815873063, |
|
"learning_rate": 1.9461167971101745e-05, |
|
"loss": 0.8152, |
|
"mean_token_accuracy": 0.783750994503498, |
|
"num_tokens": 11776430.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.028597230583985552, |
|
"grad_norm": 1.2945946368996537, |
|
"learning_rate": 1.9431065623118607e-05, |
|
"loss": 0.8485, |
|
"mean_token_accuracy": 0.7770387619733811, |
|
"num_tokens": 12427616.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.030102347983142687, |
|
"grad_norm": 0.9842224206674687, |
|
"learning_rate": 1.9400963275135463e-05, |
|
"loss": 0.8264, |
|
"mean_token_accuracy": 0.779655097424984, |
|
"num_tokens": 13079739.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.03160746538229982, |
|
"grad_norm": 1.0666414615153936, |
|
"learning_rate": 1.9370860927152318e-05, |
|
"loss": 0.8162, |
|
"mean_token_accuracy": 0.782223354279995, |
|
"num_tokens": 13734917.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.033112582781456956, |
|
"grad_norm": 0.9945764345827092, |
|
"learning_rate": 1.9340758579169177e-05, |
|
"loss": 0.8704, |
|
"mean_token_accuracy": 0.7713055685162544, |
|
"num_tokens": 14390277.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.03461770018061409, |
|
"grad_norm": 1.1032502230662455, |
|
"learning_rate": 1.9310656231186033e-05, |
|
"loss": 0.8162, |
|
"mean_token_accuracy": 0.7820397764444351, |
|
"num_tokens": 15044520.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.036122817579771226, |
|
"grad_norm": 0.9162140424517378, |
|
"learning_rate": 1.9280553883202892e-05, |
|
"loss": 0.7813, |
|
"mean_token_accuracy": 0.7926082789897919, |
|
"num_tokens": 15698080.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.03762793497892836, |
|
"grad_norm": 0.9769358855408936, |
|
"learning_rate": 1.925045153521975e-05, |
|
"loss": 0.8643, |
|
"mean_token_accuracy": 0.7731036424636841, |
|
"num_tokens": 16349750.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.03913305237808549, |
|
"grad_norm": 1.0281959693887788, |
|
"learning_rate": 1.9220349187236606e-05, |
|
"loss": 0.807, |
|
"mean_token_accuracy": 0.785991695523262, |
|
"num_tokens": 17005110.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.040638169777242626, |
|
"grad_norm": 1.1457279024679698, |
|
"learning_rate": 1.9190246839253465e-05, |
|
"loss": 0.8576, |
|
"mean_token_accuracy": 0.7716442331671715, |
|
"num_tokens": 17658497.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.04214328717639976, |
|
"grad_norm": 0.9884584269395141, |
|
"learning_rate": 1.916014449127032e-05, |
|
"loss": 0.853, |
|
"mean_token_accuracy": 0.7742137908935547, |
|
"num_tokens": 18312553.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.043648404575556896, |
|
"grad_norm": 1.0510970671462876, |
|
"learning_rate": 1.9130042143287176e-05, |
|
"loss": 0.8107, |
|
"mean_token_accuracy": 0.7836480632424354, |
|
"num_tokens": 18967136.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.04515352197471403, |
|
"grad_norm": 0.9766990463983991, |
|
"learning_rate": 1.9099939795304035e-05, |
|
"loss": 0.7804, |
|
"mean_token_accuracy": 0.7905746832489967, |
|
"num_tokens": 19620255.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.046658639373871165, |
|
"grad_norm": 0.9709386691440339, |
|
"learning_rate": 1.9069837447320894e-05, |
|
"loss": 0.8228, |
|
"mean_token_accuracy": 0.7801810503005981, |
|
"num_tokens": 20274486.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.048163756773028296, |
|
"grad_norm": 0.9797976606519577, |
|
"learning_rate": 1.903973509933775e-05, |
|
"loss": 0.7985, |
|
"mean_token_accuracy": 0.7887556836009025, |
|
"num_tokens": 20926124.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.04966887417218543, |
|
"grad_norm": 0.8691866193641657, |
|
"learning_rate": 1.900963275135461e-05, |
|
"loss": 0.7733, |
|
"mean_token_accuracy": 0.7907425567507744, |
|
"num_tokens": 21581484.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.051173991571342566, |
|
"grad_norm": 0.8353763531765348, |
|
"learning_rate": 1.8979530403371464e-05, |
|
"loss": 0.8049, |
|
"mean_token_accuracy": 0.7866038784384728, |
|
"num_tokens": 22236844.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.0526791089704997, |
|
"grad_norm": 1.047787346678463, |
|
"learning_rate": 1.894942805538832e-05, |
|
"loss": 0.801, |
|
"mean_token_accuracy": 0.7864298403263092, |
|
"num_tokens": 22892204.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.054184226369656835, |
|
"grad_norm": 0.9624940693468708, |
|
"learning_rate": 1.891932570740518e-05, |
|
"loss": 0.7777, |
|
"mean_token_accuracy": 0.7911544889211655, |
|
"num_tokens": 23546108.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.055689343768813966, |
|
"grad_norm": 0.9304563256474666, |
|
"learning_rate": 1.8889223359422038e-05, |
|
"loss": 0.7793, |
|
"mean_token_accuracy": 0.7903302207589149, |
|
"num_tokens": 24198050.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.057194461167971104, |
|
"grad_norm": 0.9586403555276852, |
|
"learning_rate": 1.8859121011438893e-05, |
|
"loss": 0.8184, |
|
"mean_token_accuracy": 0.7825526088476181, |
|
"num_tokens": 24850653.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.058699578567128236, |
|
"grad_norm": 0.9862346398087346, |
|
"learning_rate": 1.8829018663455752e-05, |
|
"loss": 0.8419, |
|
"mean_token_accuracy": 0.7795390293002129, |
|
"num_tokens": 25502964.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.060204695966285374, |
|
"grad_norm": 0.9988458761226531, |
|
"learning_rate": 1.8798916315472608e-05, |
|
"loss": 0.8113, |
|
"mean_token_accuracy": 0.7838620856404305, |
|
"num_tokens": 26157878.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.061709813365442505, |
|
"grad_norm": 0.9487172389927636, |
|
"learning_rate": 1.8768813967489467e-05, |
|
"loss": 0.7814, |
|
"mean_token_accuracy": 0.789660955965519, |
|
"num_tokens": 26811933.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.06321493076459964, |
|
"grad_norm": 0.9851037653184612, |
|
"learning_rate": 1.8738711619506322e-05, |
|
"loss": 0.8332, |
|
"mean_token_accuracy": 0.7792101621627807, |
|
"num_tokens": 27465720.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.06472004816375677, |
|
"grad_norm": 1.0042540429584381, |
|
"learning_rate": 1.8708609271523178e-05, |
|
"loss": 0.7903, |
|
"mean_token_accuracy": 0.7871855169534683, |
|
"num_tokens": 28121080.0, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.06622516556291391, |
|
"grad_norm": 0.9927572758373544, |
|
"learning_rate": 1.867850692354004e-05, |
|
"loss": 0.7727, |
|
"mean_token_accuracy": 0.7941063031554222, |
|
"num_tokens": 28774438.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.06773028296207104, |
|
"grad_norm": 0.8834285403649499, |
|
"learning_rate": 1.8648404575556896e-05, |
|
"loss": 0.8085, |
|
"mean_token_accuracy": 0.7830948889255523, |
|
"num_tokens": 29428023.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.06923540036122817, |
|
"grad_norm": 0.9113434494758214, |
|
"learning_rate": 1.861830222757375e-05, |
|
"loss": 0.7767, |
|
"mean_token_accuracy": 0.7919638514518738, |
|
"num_tokens": 30083383.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.07074051776038531, |
|
"grad_norm": 0.9341036882318858, |
|
"learning_rate": 1.858819987959061e-05, |
|
"loss": 0.7916, |
|
"mean_token_accuracy": 0.7892049625515938, |
|
"num_tokens": 30737589.0, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.07224563515954245, |
|
"grad_norm": 1.4816757132686853, |
|
"learning_rate": 1.8558097531607466e-05, |
|
"loss": 0.8253, |
|
"mean_token_accuracy": 0.7808192431926727, |
|
"num_tokens": 31391356.0, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.07375075255869958, |
|
"grad_norm": 0.9551440074457215, |
|
"learning_rate": 1.8527995183624325e-05, |
|
"loss": 0.7825, |
|
"mean_token_accuracy": 0.7899382427334786, |
|
"num_tokens": 32045281.0, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.07525586995785671, |
|
"grad_norm": 0.957693500309248, |
|
"learning_rate": 1.8497892835641184e-05, |
|
"loss": 0.8357, |
|
"mean_token_accuracy": 0.7793286591768265, |
|
"num_tokens": 32699618.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.07676098735701385, |
|
"grad_norm": 1.0341152031457617, |
|
"learning_rate": 1.846779048765804e-05, |
|
"loss": 0.7896, |
|
"mean_token_accuracy": 0.7893010467290879, |
|
"num_tokens": 33351567.0, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.07826610475617098, |
|
"grad_norm": 0.9549231501402184, |
|
"learning_rate": 1.8437688139674898e-05, |
|
"loss": 0.8599, |
|
"mean_token_accuracy": 0.7733965054154396, |
|
"num_tokens": 34003033.0, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.07977122215532811, |
|
"grad_norm": 0.9923234542168496, |
|
"learning_rate": 1.8407585791691754e-05, |
|
"loss": 0.8205, |
|
"mean_token_accuracy": 0.7820205718278885, |
|
"num_tokens": 34653481.0, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.08127633955448525, |
|
"grad_norm": 0.9524724869535042, |
|
"learning_rate": 1.837748344370861e-05, |
|
"loss": 0.8063, |
|
"mean_token_accuracy": 0.7859901934862137, |
|
"num_tokens": 35306197.0, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.08278145695364239, |
|
"grad_norm": 0.9976717179204758, |
|
"learning_rate": 1.8347381095725468e-05, |
|
"loss": 0.8214, |
|
"mean_token_accuracy": 0.7831726789474487, |
|
"num_tokens": 35957015.0, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.08428657435279951, |
|
"grad_norm": 0.9085860126433437, |
|
"learning_rate": 1.8317278747742327e-05, |
|
"loss": 0.7929, |
|
"mean_token_accuracy": 0.7867233589291572, |
|
"num_tokens": 36611115.0, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.08579169175195665, |
|
"grad_norm": 0.9266872864888698, |
|
"learning_rate": 1.8287176399759183e-05, |
|
"loss": 0.7896, |
|
"mean_token_accuracy": 0.7881813287734986, |
|
"num_tokens": 37266182.0, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.08729680915111379, |
|
"grad_norm": 0.8724843087705262, |
|
"learning_rate": 1.825707405177604e-05, |
|
"loss": 0.7713, |
|
"mean_token_accuracy": 0.7928503587841987, |
|
"num_tokens": 37920028.0, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.08880192655027092, |
|
"grad_norm": 0.9053691052273664, |
|
"learning_rate": 1.8226971703792897e-05, |
|
"loss": 0.7898, |
|
"mean_token_accuracy": 0.789909017086029, |
|
"num_tokens": 38575388.0, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.09030704394942805, |
|
"grad_norm": 0.8479670947647756, |
|
"learning_rate": 1.8196869355809753e-05, |
|
"loss": 0.7627, |
|
"mean_token_accuracy": 0.7941003635525703, |
|
"num_tokens": 39228780.0, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.09181216134858519, |
|
"grad_norm": 0.8878872189126786, |
|
"learning_rate": 1.816676700782661e-05, |
|
"loss": 0.7863, |
|
"mean_token_accuracy": 0.7889234691858291, |
|
"num_tokens": 39882466.0, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.09331727874774233, |
|
"grad_norm": 0.9378936695301893, |
|
"learning_rate": 1.813666465984347e-05, |
|
"loss": 0.8074, |
|
"mean_token_accuracy": 0.7841841742396355, |
|
"num_tokens": 40537826.0, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.09482239614689945, |
|
"grad_norm": 0.8704173755978687, |
|
"learning_rate": 1.8106562311860326e-05, |
|
"loss": 0.8057, |
|
"mean_token_accuracy": 0.7853535667061806, |
|
"num_tokens": 41193186.0, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.09632751354605659, |
|
"grad_norm": 0.980576606661472, |
|
"learning_rate": 1.8076459963877185e-05, |
|
"loss": 0.8312, |
|
"mean_token_accuracy": 0.781511053442955, |
|
"num_tokens": 41848546.0, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.09783263094521373, |
|
"grad_norm": 1.054602428103673, |
|
"learning_rate": 1.804635761589404e-05, |
|
"loss": 0.802, |
|
"mean_token_accuracy": 0.7860093146562577, |
|
"num_tokens": 42503565.0, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.09933774834437085, |
|
"grad_norm": 0.9031118355106363, |
|
"learning_rate": 1.80162552679109e-05, |
|
"loss": 0.771, |
|
"mean_token_accuracy": 0.7914280131459236, |
|
"num_tokens": 43158925.0, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.10084286574352799, |
|
"grad_norm": 0.8409564496447067, |
|
"learning_rate": 1.7986152919927755e-05, |
|
"loss": 0.8108, |
|
"mean_token_accuracy": 0.7854543194174767, |
|
"num_tokens": 43814285.0, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.10234798314268513, |
|
"grad_norm": 0.8977180215627701, |
|
"learning_rate": 1.795605057194461e-05, |
|
"loss": 0.7866, |
|
"mean_token_accuracy": 0.790862138569355, |
|
"num_tokens": 44468193.0, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.10385310054184227, |
|
"grad_norm": 0.9113767976777215, |
|
"learning_rate": 1.7925948223961473e-05, |
|
"loss": 0.7674, |
|
"mean_token_accuracy": 0.7942569464445114, |
|
"num_tokens": 45119814.0, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.1053582179409994, |
|
"grad_norm": 0.9283532440136132, |
|
"learning_rate": 1.789584587597833e-05, |
|
"loss": 0.788, |
|
"mean_token_accuracy": 0.7885141059756279, |
|
"num_tokens": 45773988.0, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.10686333534015653, |
|
"grad_norm": 0.8961049192838797, |
|
"learning_rate": 1.7865743527995184e-05, |
|
"loss": 0.7746, |
|
"mean_token_accuracy": 0.7916218876838684, |
|
"num_tokens": 46429348.0, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.10836845273931367, |
|
"grad_norm": 0.8940233613616361, |
|
"learning_rate": 1.7835641180012043e-05, |
|
"loss": 0.7571, |
|
"mean_token_accuracy": 0.7955988764762878, |
|
"num_tokens": 47082837.0, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.1098735701384708, |
|
"grad_norm": 0.9041335332517931, |
|
"learning_rate": 1.78055388320289e-05, |
|
"loss": 0.7784, |
|
"mean_token_accuracy": 0.7905792102217675, |
|
"num_tokens": 47738197.0, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.11137868753762793, |
|
"grad_norm": 0.8763038167699705, |
|
"learning_rate": 1.7775436484045757e-05, |
|
"loss": 0.8217, |
|
"mean_token_accuracy": 0.7831670209765434, |
|
"num_tokens": 48391556.0, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.11288380493678507, |
|
"grad_norm": 0.9585333018523788, |
|
"learning_rate": 1.7745334136062616e-05, |
|
"loss": 0.8045, |
|
"mean_token_accuracy": 0.7855499908328056, |
|
"num_tokens": 49046501.0, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.11438892233594221, |
|
"grad_norm": 0.9581547193027541, |
|
"learning_rate": 1.7715231788079472e-05, |
|
"loss": 0.7987, |
|
"mean_token_accuracy": 0.7847492977976799, |
|
"num_tokens": 49700398.0, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.11589403973509933, |
|
"grad_norm": 0.876463264934873, |
|
"learning_rate": 1.7685129440096327e-05, |
|
"loss": 0.7759, |
|
"mean_token_accuracy": 0.7922845646739006, |
|
"num_tokens": 50353303.0, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.11739915713425647, |
|
"grad_norm": 0.7912414198907308, |
|
"learning_rate": 1.7655027092113186e-05, |
|
"loss": 0.7635, |
|
"mean_token_accuracy": 0.7954950258135796, |
|
"num_tokens": 51004008.0, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.11890427453341361, |
|
"grad_norm": 0.8567434891484385, |
|
"learning_rate": 1.7624924744130042e-05, |
|
"loss": 0.786, |
|
"mean_token_accuracy": 0.7885375574231148, |
|
"num_tokens": 51657183.0, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.12040939193257075, |
|
"grad_norm": 0.8674077034489452, |
|
"learning_rate": 1.75948223961469e-05, |
|
"loss": 0.7491, |
|
"mean_token_accuracy": 0.7960452109575271, |
|
"num_tokens": 52306165.0, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.12191450933172787, |
|
"grad_norm": 0.9532040218592741, |
|
"learning_rate": 1.756472004816376e-05, |
|
"loss": 0.8002, |
|
"mean_token_accuracy": 0.7860918015241622, |
|
"num_tokens": 52961289.0, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.12341962673088501, |
|
"grad_norm": 0.9311440870377063, |
|
"learning_rate": 1.7534617700180615e-05, |
|
"loss": 0.7658, |
|
"mean_token_accuracy": 0.7935622245073318, |
|
"num_tokens": 53616649.0, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.12492474413004215, |
|
"grad_norm": 0.8105073832679359, |
|
"learning_rate": 1.7504515352197474e-05, |
|
"loss": 0.7367, |
|
"mean_token_accuracy": 0.7998397067189217, |
|
"num_tokens": 54272009.0, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.12642986152919927, |
|
"grad_norm": 0.8683619505144872, |
|
"learning_rate": 1.747441300421433e-05, |
|
"loss": 0.7428, |
|
"mean_token_accuracy": 0.8006275922060013, |
|
"num_tokens": 54925539.0, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.12793497892835642, |
|
"grad_norm": 0.8155490993503403, |
|
"learning_rate": 1.7444310656231185e-05, |
|
"loss": 0.7924, |
|
"mean_token_accuracy": 0.7886917501688003, |
|
"num_tokens": 55579974.0, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.12944009632751355, |
|
"grad_norm": 0.8994872501062333, |
|
"learning_rate": 1.7414208308248044e-05, |
|
"loss": 0.7884, |
|
"mean_token_accuracy": 0.7879066273570061, |
|
"num_tokens": 56234346.0, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.13094521372667067, |
|
"grad_norm": 0.944052673697117, |
|
"learning_rate": 1.7384105960264903e-05, |
|
"loss": 0.7644, |
|
"mean_token_accuracy": 0.7921950727701187, |
|
"num_tokens": 56888265.0, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.13245033112582782, |
|
"grad_norm": 0.8348805534260644, |
|
"learning_rate": 1.735400361228176e-05, |
|
"loss": 0.7852, |
|
"mean_token_accuracy": 0.7882557049393654, |
|
"num_tokens": 57542064.0, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.13395544852498495, |
|
"grad_norm": 0.8685556781874473, |
|
"learning_rate": 1.7323901264298618e-05, |
|
"loss": 0.7643, |
|
"mean_token_accuracy": 0.7916644081473351, |
|
"num_tokens": 58191468.0, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.13546056592414207, |
|
"grad_norm": 0.8819964488751374, |
|
"learning_rate": 1.7293798916315473e-05, |
|
"loss": 0.7804, |
|
"mean_token_accuracy": 0.7900642126798629, |
|
"num_tokens": 58845375.0, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.13696568332329923, |
|
"grad_norm": 0.7706719597380793, |
|
"learning_rate": 1.7263696568332332e-05, |
|
"loss": 0.7468, |
|
"mean_token_accuracy": 0.7974156990647316, |
|
"num_tokens": 59499003.0, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.13847080072245635, |
|
"grad_norm": 0.8949708540928332, |
|
"learning_rate": 1.7233594220349188e-05, |
|
"loss": 0.7697, |
|
"mean_token_accuracy": 0.7942784354090691, |
|
"num_tokens": 60153143.0, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.13997591812161347, |
|
"grad_norm": 0.8938471826741746, |
|
"learning_rate": 1.7203491872366043e-05, |
|
"loss": 0.7565, |
|
"mean_token_accuracy": 0.7957235768437385, |
|
"num_tokens": 60808223.0, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.14148103552077063, |
|
"grad_norm": 0.8358286535449414, |
|
"learning_rate": 1.7173389524382902e-05, |
|
"loss": 0.767, |
|
"mean_token_accuracy": 0.7933147415518761, |
|
"num_tokens": 61461734.0, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.14298615291992775, |
|
"grad_norm": 0.8712281153910671, |
|
"learning_rate": 1.714328717639976e-05, |
|
"loss": 0.7638, |
|
"mean_token_accuracy": 0.7921429499983788, |
|
"num_tokens": 62116634.0, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.1444912703190849, |
|
"grad_norm": 0.8684270174776013, |
|
"learning_rate": 1.7113184828416617e-05, |
|
"loss": 0.7897, |
|
"mean_token_accuracy": 0.7875265553593636, |
|
"num_tokens": 62771245.0, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.14599638771824203, |
|
"grad_norm": 0.7955045486378506, |
|
"learning_rate": 1.7083082480433476e-05, |
|
"loss": 0.7261, |
|
"mean_token_accuracy": 0.8014863416552543, |
|
"num_tokens": 63424706.0, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.14750150511739915, |
|
"grad_norm": 0.8899697106487319, |
|
"learning_rate": 1.705298013245033e-05, |
|
"loss": 0.7632, |
|
"mean_token_accuracy": 0.7941789865493775, |
|
"num_tokens": 64080066.0, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.1490066225165563, |
|
"grad_norm": 0.804285362005701, |
|
"learning_rate": 1.702287778446719e-05, |
|
"loss": 0.7726, |
|
"mean_token_accuracy": 0.7943894654512406, |
|
"num_tokens": 64733226.0, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.15051173991571343, |
|
"grad_norm": 0.8787668654860644, |
|
"learning_rate": 1.699277543648405e-05, |
|
"loss": 0.7698, |
|
"mean_token_accuracy": 0.7918394789099693, |
|
"num_tokens": 65387161.0, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.15201685731487055, |
|
"grad_norm": 0.8352868459629883, |
|
"learning_rate": 1.6962673088500905e-05, |
|
"loss": 0.7785, |
|
"mean_token_accuracy": 0.79125065356493, |
|
"num_tokens": 66041447.0, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.1535219747140277, |
|
"grad_norm": 0.7673967975443655, |
|
"learning_rate": 1.693257074051776e-05, |
|
"loss": 0.757, |
|
"mean_token_accuracy": 0.7955381035804748, |
|
"num_tokens": 66695643.0, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.15502709211318483, |
|
"grad_norm": 0.9619399874547646, |
|
"learning_rate": 1.690246839253462e-05, |
|
"loss": 0.8063, |
|
"mean_token_accuracy": 0.7852592051029206, |
|
"num_tokens": 67349265.0, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.15653220951234195, |
|
"grad_norm": 0.862700449720151, |
|
"learning_rate": 1.6872366044551475e-05, |
|
"loss": 0.7515, |
|
"mean_token_accuracy": 0.7977708280086517, |
|
"num_tokens": 68004324.0, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.1580373269114991, |
|
"grad_norm": 0.8137469077952871, |
|
"learning_rate": 1.6842263696568334e-05, |
|
"loss": 0.7265, |
|
"mean_token_accuracy": 0.8019616097211838, |
|
"num_tokens": 68657707.0, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.15954244431065623, |
|
"grad_norm": 0.8704908427956669, |
|
"learning_rate": 1.6812161348585193e-05, |
|
"loss": 0.7652, |
|
"mean_token_accuracy": 0.7943032309412956, |
|
"num_tokens": 69309982.0, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.16104756170981335, |
|
"grad_norm": 0.8721161816172613, |
|
"learning_rate": 1.6782059000602048e-05, |
|
"loss": 0.7702, |
|
"mean_token_accuracy": 0.7941267043352127, |
|
"num_tokens": 69964695.0, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.1625526791089705, |
|
"grad_norm": 0.8189061107425057, |
|
"learning_rate": 1.6751956652618907e-05, |
|
"loss": 0.7685, |
|
"mean_token_accuracy": 0.7943713366985321, |
|
"num_tokens": 70620055.0, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.16405779650812763, |
|
"grad_norm": 0.8436846845627259, |
|
"learning_rate": 1.6721854304635763e-05, |
|
"loss": 0.7319, |
|
"mean_token_accuracy": 0.8026227414608001, |
|
"num_tokens": 71275415.0, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.16556291390728478, |
|
"grad_norm": 0.8274262661586408, |
|
"learning_rate": 1.669175195665262e-05, |
|
"loss": 0.7545, |
|
"mean_token_accuracy": 0.7971039965748787, |
|
"num_tokens": 71930775.0, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.1670680313064419, |
|
"grad_norm": 0.8750608846258383, |
|
"learning_rate": 1.6661649608669477e-05, |
|
"loss": 0.7463, |
|
"mean_token_accuracy": 0.7971311554312706, |
|
"num_tokens": 72584266.0, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.16857314870559903, |
|
"grad_norm": 0.829213357340937, |
|
"learning_rate": 1.6631547260686336e-05, |
|
"loss": 0.7532, |
|
"mean_token_accuracy": 0.7960872635245323, |
|
"num_tokens": 73239626.0, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.17007826610475618, |
|
"grad_norm": 0.8094988426657819, |
|
"learning_rate": 1.6601444912703192e-05, |
|
"loss": 0.7601, |
|
"mean_token_accuracy": 0.7956007704138756, |
|
"num_tokens": 73892202.0, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.1715833835039133, |
|
"grad_norm": 0.8040984765694252, |
|
"learning_rate": 1.657134256472005e-05, |
|
"loss": 0.7277, |
|
"mean_token_accuracy": 0.8024196982383728, |
|
"num_tokens": 74547562.0, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.17308850090307043, |
|
"grad_norm": 0.9302568035038656, |
|
"learning_rate": 1.6541240216736906e-05, |
|
"loss": 0.7692, |
|
"mean_token_accuracy": 0.7915791377425194, |
|
"num_tokens": 75202922.0, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.17459361830222758, |
|
"grad_norm": 0.8529288739205021, |
|
"learning_rate": 1.6511137868753765e-05, |
|
"loss": 0.7633, |
|
"mean_token_accuracy": 0.794321759045124, |
|
"num_tokens": 75855977.0, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.1760987357013847, |
|
"grad_norm": 0.9492954962518395, |
|
"learning_rate": 1.648103552077062e-05, |
|
"loss": 0.7658, |
|
"mean_token_accuracy": 0.7943087443709373, |
|
"num_tokens": 76511337.0, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.17760385310054183, |
|
"grad_norm": 0.8701282778319047, |
|
"learning_rate": 1.6450933172787476e-05, |
|
"loss": 0.8226, |
|
"mean_token_accuracy": 0.780410946905613, |
|
"num_tokens": 77166563.0, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.17910897049969898, |
|
"grad_norm": 0.8747754872764684, |
|
"learning_rate": 1.6420830824804335e-05, |
|
"loss": 0.7607, |
|
"mean_token_accuracy": 0.7948632016777992, |
|
"num_tokens": 77821721.0, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.1806140878988561, |
|
"grad_norm": 0.7831429169639552, |
|
"learning_rate": 1.6390728476821194e-05, |
|
"loss": 0.7541, |
|
"mean_token_accuracy": 0.7957193419337273, |
|
"num_tokens": 78477081.0, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.18211920529801323, |
|
"grad_norm": 0.8432752703822518, |
|
"learning_rate": 1.636062612883805e-05, |
|
"loss": 0.7925, |
|
"mean_token_accuracy": 0.7887899950146675, |
|
"num_tokens": 79132441.0, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.18362432269717038, |
|
"grad_norm": 0.8504998453679667, |
|
"learning_rate": 1.633052378085491e-05, |
|
"loss": 0.7648, |
|
"mean_token_accuracy": 0.7930545896291733, |
|
"num_tokens": 79787010.0, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.1851294400963275, |
|
"grad_norm": 0.9010706298818607, |
|
"learning_rate": 1.6300421432871764e-05, |
|
"loss": 0.7745, |
|
"mean_token_accuracy": 0.7909213706851006, |
|
"num_tokens": 80440325.0, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.18663455749548466, |
|
"grad_norm": 0.8884532116760039, |
|
"learning_rate": 1.6270319084888623e-05, |
|
"loss": 0.7582, |
|
"mean_token_accuracy": 0.7969711780548095, |
|
"num_tokens": 81095685.0, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.18813967489464178, |
|
"grad_norm": 0.8037962066802894, |
|
"learning_rate": 1.6240216736905482e-05, |
|
"loss": 0.7608, |
|
"mean_token_accuracy": 0.7947753235697746, |
|
"num_tokens": 81750400.0, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.1896447922937989, |
|
"grad_norm": 0.8812056476404384, |
|
"learning_rate": 1.6210114388922338e-05, |
|
"loss": 0.7723, |
|
"mean_token_accuracy": 0.794083659350872, |
|
"num_tokens": 82405478.0, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.19114990969295606, |
|
"grad_norm": 0.9960349572295231, |
|
"learning_rate": 1.6180012040939193e-05, |
|
"loss": 0.7704, |
|
"mean_token_accuracy": 0.7944280609488488, |
|
"num_tokens": 83058284.0, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.19265502709211318, |
|
"grad_norm": 0.8677664262970063, |
|
"learning_rate": 1.6149909692956052e-05, |
|
"loss": 0.7835, |
|
"mean_token_accuracy": 0.7911761164665222, |
|
"num_tokens": 83713644.0, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.1941601444912703, |
|
"grad_norm": 0.7916221716489966, |
|
"learning_rate": 1.6119807344972908e-05, |
|
"loss": 0.7442, |
|
"mean_token_accuracy": 0.7980069324374199, |
|
"num_tokens": 84366958.0, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.19566526189042746, |
|
"grad_norm": 0.8445952572495113, |
|
"learning_rate": 1.6089704996989767e-05, |
|
"loss": 0.7765, |
|
"mean_token_accuracy": 0.7921729102730751, |
|
"num_tokens": 85021614.0, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.19717037928958459, |
|
"grad_norm": 0.8925383428599953, |
|
"learning_rate": 1.6059602649006626e-05, |
|
"loss": 0.761, |
|
"mean_token_accuracy": 0.7955117270350456, |
|
"num_tokens": 85676974.0, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.1986754966887417, |
|
"grad_norm": 0.8841497219076043, |
|
"learning_rate": 1.602950030102348e-05, |
|
"loss": 0.7856, |
|
"mean_token_accuracy": 0.7890258222818375, |
|
"num_tokens": 86329344.0, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.20018061408789886, |
|
"grad_norm": 0.8277609911959379, |
|
"learning_rate": 1.599939795304034e-05, |
|
"loss": 0.7176, |
|
"mean_token_accuracy": 0.8031543210148812, |
|
"num_tokens": 86983588.0, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.20168573148705599, |
|
"grad_norm": 0.7762617521752686, |
|
"learning_rate": 1.5969295605057196e-05, |
|
"loss": 0.7262, |
|
"mean_token_accuracy": 0.8033408597111702, |
|
"num_tokens": 87638093.0, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.20319084888621314, |
|
"grad_norm": 0.8574797249960955, |
|
"learning_rate": 1.593919325707405e-05, |
|
"loss": 0.7721, |
|
"mean_token_accuracy": 0.7914051085710525, |
|
"num_tokens": 88293453.0, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.20469596628537026, |
|
"grad_norm": 0.8525932598240934, |
|
"learning_rate": 1.590909090909091e-05, |
|
"loss": 0.7692, |
|
"mean_token_accuracy": 0.7919760629534721, |
|
"num_tokens": 88947192.0, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.2062010836845274, |
|
"grad_norm": 0.8470054492079515, |
|
"learning_rate": 1.587898856110777e-05, |
|
"loss": 0.7487, |
|
"mean_token_accuracy": 0.7974001586437225, |
|
"num_tokens": 89602552.0, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.20770620108368454, |
|
"grad_norm": 0.7728762356874295, |
|
"learning_rate": 1.5848886213124625e-05, |
|
"loss": 0.7861, |
|
"mean_token_accuracy": 0.7887239217758178, |
|
"num_tokens": 90256955.0, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.20921131848284166, |
|
"grad_norm": 0.8411401203292108, |
|
"learning_rate": 1.5818783865141484e-05, |
|
"loss": 0.7432, |
|
"mean_token_accuracy": 0.800141978263855, |
|
"num_tokens": 90912315.0, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.2107164358819988, |
|
"grad_norm": 0.8028334181793002, |
|
"learning_rate": 1.578868151715834e-05, |
|
"loss": 0.727, |
|
"mean_token_accuracy": 0.8017411068081856, |
|
"num_tokens": 91566721.0, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.21222155328115594, |
|
"grad_norm": 0.9408129549604158, |
|
"learning_rate": 1.5758579169175198e-05, |
|
"loss": 0.7753, |
|
"mean_token_accuracy": 0.79281265437603, |
|
"num_tokens": 92222081.0, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.21372667068031306, |
|
"grad_norm": 0.870622637546394, |
|
"learning_rate": 1.5728476821192054e-05, |
|
"loss": 0.7391, |
|
"mean_token_accuracy": 0.7998394921422005, |
|
"num_tokens": 92873611.0, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.2152317880794702, |
|
"grad_norm": 0.9912659604734557, |
|
"learning_rate": 1.569837447320891e-05, |
|
"loss": 0.7608, |
|
"mean_token_accuracy": 0.7938985392451287, |
|
"num_tokens": 93526682.0, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.21673690547862734, |
|
"grad_norm": 0.8130531325261461, |
|
"learning_rate": 1.5668272125225768e-05, |
|
"loss": 0.7768, |
|
"mean_token_accuracy": 0.7910584717988968, |
|
"num_tokens": 94180965.0, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.21824202287778446, |
|
"grad_norm": 0.9191216718462245, |
|
"learning_rate": 1.5638169777242627e-05, |
|
"loss": 0.7246, |
|
"mean_token_accuracy": 0.8026754096150398, |
|
"num_tokens": 94835088.0, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.2197471402769416, |
|
"grad_norm": 0.8300249028967778, |
|
"learning_rate": 1.5608067429259483e-05, |
|
"loss": 0.7511, |
|
"mean_token_accuracy": 0.7983420848846435, |
|
"num_tokens": 95490448.0, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.22125225767609874, |
|
"grad_norm": 0.7951346672855284, |
|
"learning_rate": 1.557796508127634e-05, |
|
"loss": 0.7579, |
|
"mean_token_accuracy": 0.7955086678266525, |
|
"num_tokens": 96145808.0, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.22275737507525586, |
|
"grad_norm": 0.88803863878293, |
|
"learning_rate": 1.5547862733293197e-05, |
|
"loss": 0.7456, |
|
"mean_token_accuracy": 0.7965317487716674, |
|
"num_tokens": 96799113.0, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.22426249247441302, |
|
"grad_norm": 0.8643113106520561, |
|
"learning_rate": 1.5517760385310056e-05, |
|
"loss": 0.7592, |
|
"mean_token_accuracy": 0.794936190545559, |
|
"num_tokens": 97454473.0, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.22576760987357014, |
|
"grad_norm": 0.7656311573450891, |
|
"learning_rate": 1.5487658037326915e-05, |
|
"loss": 0.744, |
|
"mean_token_accuracy": 0.798383304476738, |
|
"num_tokens": 98109833.0, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.22727272727272727, |
|
"grad_norm": 0.7807264440245045, |
|
"learning_rate": 1.545755568934377e-05, |
|
"loss": 0.7632, |
|
"mean_token_accuracy": 0.7929225742816925, |
|
"num_tokens": 98765193.0, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.22877784467188442, |
|
"grad_norm": 0.8577954072341681, |
|
"learning_rate": 1.5427453341360626e-05, |
|
"loss": 0.7809, |
|
"mean_token_accuracy": 0.7900413990020752, |
|
"num_tokens": 99417533.0, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.23028296207104154, |
|
"grad_norm": 0.7745693995209237, |
|
"learning_rate": 1.5397350993377485e-05, |
|
"loss": 0.7324, |
|
"mean_token_accuracy": 0.8006270915269852, |
|
"num_tokens": 100071717.0, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.23178807947019867, |
|
"grad_norm": 0.8269975042567316, |
|
"learning_rate": 1.536724864539434e-05, |
|
"loss": 0.7562, |
|
"mean_token_accuracy": 0.7966322675347328, |
|
"num_tokens": 100727077.0, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.23329319686935582, |
|
"grad_norm": 0.8059920867147564, |
|
"learning_rate": 1.53371462974112e-05, |
|
"loss": 0.7915, |
|
"mean_token_accuracy": 0.7891182228922844, |
|
"num_tokens": 101382437.0, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.23479831426851294, |
|
"grad_norm": 0.8646678410794064, |
|
"learning_rate": 1.530704394942806e-05, |
|
"loss": 0.7897, |
|
"mean_token_accuracy": 0.7905606806278229, |
|
"num_tokens": 102033756.0, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.23630343166767007, |
|
"grad_norm": 0.8547304753227982, |
|
"learning_rate": 1.5276941601444914e-05, |
|
"loss": 0.7752, |
|
"mean_token_accuracy": 0.790402115881443, |
|
"num_tokens": 102689116.0, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.23780854906682722, |
|
"grad_norm": 1.4190184748774233, |
|
"learning_rate": 1.5246839253461771e-05, |
|
"loss": 0.7318, |
|
"mean_token_accuracy": 0.8021197319030762, |
|
"num_tokens": 103342715.0, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.23931366646598434, |
|
"grad_norm": 0.7737584676524992, |
|
"learning_rate": 1.5216736905478629e-05, |
|
"loss": 0.7635, |
|
"mean_token_accuracy": 0.7938489958643913, |
|
"num_tokens": 103997145.0, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.2408187838651415, |
|
"grad_norm": 0.9280215259490882, |
|
"learning_rate": 1.5186634557495486e-05, |
|
"loss": 0.7452, |
|
"mean_token_accuracy": 0.7994018048048019, |
|
"num_tokens": 104649816.0, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.24232390126429862, |
|
"grad_norm": 0.8257952543204938, |
|
"learning_rate": 1.5156532209512343e-05, |
|
"loss": 0.7342, |
|
"mean_token_accuracy": 0.8016293570399284, |
|
"num_tokens": 105302893.0, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.24382901866345574, |
|
"grad_norm": 0.7807229346437049, |
|
"learning_rate": 1.5126429861529202e-05, |
|
"loss": 0.7469, |
|
"mean_token_accuracy": 0.7980134293437005, |
|
"num_tokens": 105956619.0, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.2453341360626129, |
|
"grad_norm": 0.9502338247595454, |
|
"learning_rate": 1.509632751354606e-05, |
|
"loss": 0.722, |
|
"mean_token_accuracy": 0.8015266269445419, |
|
"num_tokens": 106611979.0, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.24683925346177002, |
|
"grad_norm": 0.9054067925446354, |
|
"learning_rate": 1.5066225165562915e-05, |
|
"loss": 0.7834, |
|
"mean_token_accuracy": 0.7897364005446434, |
|
"num_tokens": 107266458.0, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.24834437086092714, |
|
"grad_norm": 0.8096932012825973, |
|
"learning_rate": 1.5036122817579772e-05, |
|
"loss": 0.7141, |
|
"mean_token_accuracy": 0.8049699932336807, |
|
"num_tokens": 107919891.0, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.2498494882600843, |
|
"grad_norm": 0.8111660189851574, |
|
"learning_rate": 1.500602046959663e-05, |
|
"loss": 0.7389, |
|
"mean_token_accuracy": 0.7994946867227555, |
|
"num_tokens": 108575251.0, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.25135460565924145, |
|
"grad_norm": 0.872485698341458, |
|
"learning_rate": 1.4975918121613487e-05, |
|
"loss": 0.7638, |
|
"mean_token_accuracy": 0.793247839808464, |
|
"num_tokens": 109229397.0, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.25285972305839854, |
|
"grad_norm": 0.7989740529438174, |
|
"learning_rate": 1.4945815773630344e-05, |
|
"loss": 0.7406, |
|
"mean_token_accuracy": 0.8004824161529541, |
|
"num_tokens": 109884757.0, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.2543648404575557, |
|
"grad_norm": 0.9962226601760668, |
|
"learning_rate": 1.4915713425647203e-05, |
|
"loss": 0.7389, |
|
"mean_token_accuracy": 0.8010228395462036, |
|
"num_tokens": 110539326.0, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.25586995785671285, |
|
"grad_norm": 0.8780144542163605, |
|
"learning_rate": 1.488561107766406e-05, |
|
"loss": 0.7597, |
|
"mean_token_accuracy": 0.7951804459095001, |
|
"num_tokens": 111194686.0, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.25737507525586995, |
|
"grad_norm": 0.843043910756463, |
|
"learning_rate": 1.4855508729680917e-05, |
|
"loss": 0.7545, |
|
"mean_token_accuracy": 0.7975330114364624, |
|
"num_tokens": 111849236.0, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.2588801926550271, |
|
"grad_norm": 0.8853179593156032, |
|
"learning_rate": 1.4825406381697773e-05, |
|
"loss": 0.7509, |
|
"mean_token_accuracy": 0.7967573747038841, |
|
"num_tokens": 112502230.0, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.26038531005418425, |
|
"grad_norm": 1.5017248864744905, |
|
"learning_rate": 1.479530403371463e-05, |
|
"loss": 0.7542, |
|
"mean_token_accuracy": 0.7963590011000633, |
|
"num_tokens": 113157590.0, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.26189042745334135, |
|
"grad_norm": 0.7446024351040114, |
|
"learning_rate": 1.4765201685731487e-05, |
|
"loss": 0.712, |
|
"mean_token_accuracy": 0.8053502306342125, |
|
"num_tokens": 113812483.0, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.2633955448524985, |
|
"grad_norm": 0.7781825583653845, |
|
"learning_rate": 1.4735099337748346e-05, |
|
"loss": 0.7794, |
|
"mean_token_accuracy": 0.7910980373620987, |
|
"num_tokens": 114467211.0, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.26490066225165565, |
|
"grad_norm": 0.8906574727322736, |
|
"learning_rate": 1.4704996989765203e-05, |
|
"loss": 0.7804, |
|
"mean_token_accuracy": 0.7923104777932167, |
|
"num_tokens": 115122474.0, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.26640577965081275, |
|
"grad_norm": 0.8521902207969323, |
|
"learning_rate": 1.467489464178206e-05, |
|
"loss": 0.7482, |
|
"mean_token_accuracy": 0.7965755835175514, |
|
"num_tokens": 115775617.0, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.2679108970499699, |
|
"grad_norm": 0.8391702368301299, |
|
"learning_rate": 1.4644792293798918e-05, |
|
"loss": 0.7342, |
|
"mean_token_accuracy": 0.8010420575737953, |
|
"num_tokens": 116427646.0, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.26941601444912705, |
|
"grad_norm": 0.7571804485530916, |
|
"learning_rate": 1.4614689945815773e-05, |
|
"loss": 0.7439, |
|
"mean_token_accuracy": 0.7991481438279152, |
|
"num_tokens": 117083006.0, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.27092113184828415, |
|
"grad_norm": 0.8948756751173403, |
|
"learning_rate": 1.458458759783263e-05, |
|
"loss": 0.7035, |
|
"mean_token_accuracy": 0.8078163161873817, |
|
"num_tokens": 117738366.0, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.2724262492474413, |
|
"grad_norm": 0.8189089996058311, |
|
"learning_rate": 1.455448524984949e-05, |
|
"loss": 0.7333, |
|
"mean_token_accuracy": 0.8022503554821014, |
|
"num_tokens": 118390911.0, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.27393136664659845, |
|
"grad_norm": 0.8562867154390081, |
|
"learning_rate": 1.4524382901866347e-05, |
|
"loss": 0.6998, |
|
"mean_token_accuracy": 0.8098207741975785, |
|
"num_tokens": 119046271.0, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.27543648404575555, |
|
"grad_norm": 0.8000412239036848, |
|
"learning_rate": 1.4494280553883204e-05, |
|
"loss": 0.7593, |
|
"mean_token_accuracy": 0.7960475966334343, |
|
"num_tokens": 119700872.0, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.2769416014449127, |
|
"grad_norm": 0.7022083363095177, |
|
"learning_rate": 1.4464178205900061e-05, |
|
"loss": 0.7187, |
|
"mean_token_accuracy": 0.8039081588387489, |
|
"num_tokens": 120356232.0, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.27844671884406985, |
|
"grad_norm": 0.7330810854396546, |
|
"learning_rate": 1.4434075857916919e-05, |
|
"loss": 0.6802, |
|
"mean_token_accuracy": 0.811624014377594, |
|
"num_tokens": 121008241.0, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.27995183624322695, |
|
"grad_norm": 0.8709735932681155, |
|
"learning_rate": 1.4403973509933776e-05, |
|
"loss": 0.7587, |
|
"mean_token_accuracy": 0.7946249365806579, |
|
"num_tokens": 121662181.0, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.2814569536423841, |
|
"grad_norm": 0.7271993838315252, |
|
"learning_rate": 1.4373871161950635e-05, |
|
"loss": 0.7252, |
|
"mean_token_accuracy": 0.801104761660099, |
|
"num_tokens": 122316062.0, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.28296207104154125, |
|
"grad_norm": 0.9081751258700796, |
|
"learning_rate": 1.4343768813967492e-05, |
|
"loss": 0.7551, |
|
"mean_token_accuracy": 0.7959865048527718, |
|
"num_tokens": 122971422.0, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.28446718844069835, |
|
"grad_norm": 0.816281030997641, |
|
"learning_rate": 1.4313666465984348e-05, |
|
"loss": 0.74, |
|
"mean_token_accuracy": 0.797758474946022, |
|
"num_tokens": 123624695.0, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.2859723058398555, |
|
"grad_norm": 1.0641385504831533, |
|
"learning_rate": 1.4283564118001205e-05, |
|
"loss": 0.7221, |
|
"mean_token_accuracy": 0.8031127870082855, |
|
"num_tokens": 124280055.0, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.28747742323901265, |
|
"grad_norm": 0.8275815728201844, |
|
"learning_rate": 1.4253461770018062e-05, |
|
"loss": 0.7253, |
|
"mean_token_accuracy": 0.8021907031536102, |
|
"num_tokens": 124935415.0, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.2889825406381698, |
|
"grad_norm": 0.8414855000219864, |
|
"learning_rate": 1.422335942203492e-05, |
|
"loss": 0.7182, |
|
"mean_token_accuracy": 0.8061159908771515, |
|
"num_tokens": 125588888.0, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.2904876580373269, |
|
"grad_norm": 0.9158958808999972, |
|
"learning_rate": 1.4193257074051777e-05, |
|
"loss": 0.7462, |
|
"mean_token_accuracy": 0.7974677577614784, |
|
"num_tokens": 126241947.0, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.29199277543648405, |
|
"grad_norm": 0.8185830922905779, |
|
"learning_rate": 1.4163154726068636e-05, |
|
"loss": 0.7372, |
|
"mean_token_accuracy": 0.8008592411875725, |
|
"num_tokens": 126895869.0, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.2934978928356412, |
|
"grad_norm": 0.8125313384818498, |
|
"learning_rate": 1.4133052378085493e-05, |
|
"loss": 0.7335, |
|
"mean_token_accuracy": 0.7997892886400223, |
|
"num_tokens": 127549296.0, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.2950030102347983, |
|
"grad_norm": 0.8853228359721746, |
|
"learning_rate": 1.4102950030102348e-05, |
|
"loss": 0.7578, |
|
"mean_token_accuracy": 0.7951950415968895, |
|
"num_tokens": 128199122.0, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.29650812763395545, |
|
"grad_norm": 0.8312902050411218, |
|
"learning_rate": 1.4072847682119206e-05, |
|
"loss": 0.7349, |
|
"mean_token_accuracy": 0.8013306766748428, |
|
"num_tokens": 128852084.0, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.2980132450331126, |
|
"grad_norm": 0.9400229441738973, |
|
"learning_rate": 1.4042745334136063e-05, |
|
"loss": 0.7709, |
|
"mean_token_accuracy": 0.792121222615242, |
|
"num_tokens": 129505684.0, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.2995183624322697, |
|
"grad_norm": 0.767457194540183, |
|
"learning_rate": 1.401264298615292e-05, |
|
"loss": 0.7439, |
|
"mean_token_accuracy": 0.7998887673020363, |
|
"num_tokens": 130159751.0, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.30102347983142685, |
|
"grad_norm": 0.8181485482754092, |
|
"learning_rate": 1.3982540638169779e-05, |
|
"loss": 0.7695, |
|
"mean_token_accuracy": 0.793725848197937, |
|
"num_tokens": 130814051.0, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.302528597230584, |
|
"grad_norm": 0.8817939411380016, |
|
"learning_rate": 1.3952438290186636e-05, |
|
"loss": 0.7635, |
|
"mean_token_accuracy": 0.7946731060743332, |
|
"num_tokens": 131469233.0, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.3040337146297411, |
|
"grad_norm": 0.8545142349530216, |
|
"learning_rate": 1.3922335942203494e-05, |
|
"loss": 0.7649, |
|
"mean_token_accuracy": 0.7935700654983521, |
|
"num_tokens": 132124451.0, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.30553883202889826, |
|
"grad_norm": 0.7376149691050216, |
|
"learning_rate": 1.389223359422035e-05, |
|
"loss": 0.7016, |
|
"mean_token_accuracy": 0.8074377551674843, |
|
"num_tokens": 132779246.0, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.3070439494280554, |
|
"grad_norm": 0.8034691098129949, |
|
"learning_rate": 1.3862131246237206e-05, |
|
"loss": 0.7396, |
|
"mean_token_accuracy": 0.7991329744458199, |
|
"num_tokens": 133430264.0, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.3085490668272125, |
|
"grad_norm": 0.8410179177737847, |
|
"learning_rate": 1.3832028898254064e-05, |
|
"loss": 0.7354, |
|
"mean_token_accuracy": 0.800162672996521, |
|
"num_tokens": 134084988.0, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.31005418422636966, |
|
"grad_norm": 0.7567872331068635, |
|
"learning_rate": 1.3801926550270923e-05, |
|
"loss": 0.7419, |
|
"mean_token_accuracy": 0.8004732549190521, |
|
"num_tokens": 134740348.0, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.3115593016255268, |
|
"grad_norm": 0.8016571016171221, |
|
"learning_rate": 1.377182420228778e-05, |
|
"loss": 0.7298, |
|
"mean_token_accuracy": 0.8019894018769265, |
|
"num_tokens": 135392922.0, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.3130644190246839, |
|
"grad_norm": 0.719704414551152, |
|
"learning_rate": 1.3741721854304637e-05, |
|
"loss": 0.7449, |
|
"mean_token_accuracy": 0.7986428335309028, |
|
"num_tokens": 136048282.0, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.31456953642384106, |
|
"grad_norm": 0.8542849260567408, |
|
"learning_rate": 1.3711619506321494e-05, |
|
"loss": 0.7375, |
|
"mean_token_accuracy": 0.799374783039093, |
|
"num_tokens": 136702532.0, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.3160746538229982, |
|
"grad_norm": 0.8615669462955476, |
|
"learning_rate": 1.3681517158338352e-05, |
|
"loss": 0.7581, |
|
"mean_token_accuracy": 0.7959140941500664, |
|
"num_tokens": 137354096.0, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.3175797712221553, |
|
"grad_norm": 0.8623192413813401, |
|
"learning_rate": 1.3651414810355209e-05, |
|
"loss": 0.7677, |
|
"mean_token_accuracy": 0.7939656764268875, |
|
"num_tokens": 138008866.0, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.31908488862131246, |
|
"grad_norm": 0.8638683869743261, |
|
"learning_rate": 1.3621312462372068e-05, |
|
"loss": 0.7854, |
|
"mean_token_accuracy": 0.7892706394195557, |
|
"num_tokens": 138662243.0, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.3205900060204696, |
|
"grad_norm": 0.7788945368357936, |
|
"learning_rate": 1.3591210114388925e-05, |
|
"loss": 0.7033, |
|
"mean_token_accuracy": 0.8076969027519226, |
|
"num_tokens": 139315867.0, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.3220951234196267, |
|
"grad_norm": 0.7810837509877583, |
|
"learning_rate": 1.356110776640578e-05, |
|
"loss": 0.7191, |
|
"mean_token_accuracy": 0.8021959617733956, |
|
"num_tokens": 139968814.0, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.32360024081878386, |
|
"grad_norm": 0.8230298462354105, |
|
"learning_rate": 1.3531005418422638e-05, |
|
"loss": 0.7472, |
|
"mean_token_accuracy": 0.7974795445799827, |
|
"num_tokens": 140624174.0, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.325105358217941, |
|
"grad_norm": 0.8121478510778044, |
|
"learning_rate": 1.3500903070439495e-05, |
|
"loss": 0.7275, |
|
"mean_token_accuracy": 0.8007212415337562, |
|
"num_tokens": 141278692.0, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.3266104756170981, |
|
"grad_norm": 0.7615542679915625, |
|
"learning_rate": 1.3470800722456352e-05, |
|
"loss": 0.695, |
|
"mean_token_accuracy": 0.8095154449343681, |
|
"num_tokens": 141932698.0, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.32811559301625526, |
|
"grad_norm": 0.8364388460827363, |
|
"learning_rate": 1.344069837447321e-05, |
|
"loss": 0.7535, |
|
"mean_token_accuracy": 0.7957231163978576, |
|
"num_tokens": 142583403.0, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.3296207104154124, |
|
"grad_norm": 0.8118011063357086, |
|
"learning_rate": 1.3410596026490068e-05, |
|
"loss": 0.7265, |
|
"mean_token_accuracy": 0.8028008803725243, |
|
"num_tokens": 143236811.0, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.33112582781456956, |
|
"grad_norm": 0.7816385932744847, |
|
"learning_rate": 1.3380493678506926e-05, |
|
"loss": 0.7087, |
|
"mean_token_accuracy": 0.8058973535895347, |
|
"num_tokens": 143892171.0, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.33263094521372666, |
|
"grad_norm": 0.7980286746369547, |
|
"learning_rate": 1.3350391330523781e-05, |
|
"loss": 0.7001, |
|
"mean_token_accuracy": 0.8083798885345459, |
|
"num_tokens": 144547279.0, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.3341360626128838, |
|
"grad_norm": 0.8103280863411036, |
|
"learning_rate": 1.3320288982540638e-05, |
|
"loss": 0.7147, |
|
"mean_token_accuracy": 0.8047249019145966, |
|
"num_tokens": 145202639.0, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.33564118001204096, |
|
"grad_norm": 0.7817312732868804, |
|
"learning_rate": 1.3290186634557496e-05, |
|
"loss": 0.7547, |
|
"mean_token_accuracy": 0.7974317491054534, |
|
"num_tokens": 145856181.0, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.33714629741119806, |
|
"grad_norm": 0.809969017978389, |
|
"learning_rate": 1.3260084286574353e-05, |
|
"loss": 0.7386, |
|
"mean_token_accuracy": 0.7985619261860848, |
|
"num_tokens": 146511541.0, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.3386514148103552, |
|
"grad_norm": 0.8423961245555179, |
|
"learning_rate": 1.3229981938591212e-05, |
|
"loss": 0.7413, |
|
"mean_token_accuracy": 0.8008396446704864, |
|
"num_tokens": 147166901.0, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.34015653220951236, |
|
"grad_norm": 0.8490396844604676, |
|
"learning_rate": 1.319987959060807e-05, |
|
"loss": 0.7428, |
|
"mean_token_accuracy": 0.7985736206173897, |
|
"num_tokens": 147821517.0, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.34166164960866946, |
|
"grad_norm": 0.9544565589432954, |
|
"learning_rate": 1.3169777242624926e-05, |
|
"loss": 0.71, |
|
"mean_token_accuracy": 0.8067162126302719, |
|
"num_tokens": 148475213.0, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.3431667670078266, |
|
"grad_norm": 0.8532463095029145, |
|
"learning_rate": 1.3139674894641784e-05, |
|
"loss": 0.7629, |
|
"mean_token_accuracy": 0.7937496155500412, |
|
"num_tokens": 149129302.0, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.34467188440698376, |
|
"grad_norm": 0.8697931847348629, |
|
"learning_rate": 1.310957254665864e-05, |
|
"loss": 0.7561, |
|
"mean_token_accuracy": 0.7952659383416176, |
|
"num_tokens": 149784662.0, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.34617700180614086, |
|
"grad_norm": 0.8108775998481546, |
|
"learning_rate": 1.3079470198675496e-05, |
|
"loss": 0.7367, |
|
"mean_token_accuracy": 0.800705449283123, |
|
"num_tokens": 150438278.0, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.347682119205298, |
|
"grad_norm": 0.8438997210922776, |
|
"learning_rate": 1.3049367850692355e-05, |
|
"loss": 0.7235, |
|
"mean_token_accuracy": 0.8016945570707321, |
|
"num_tokens": 151093638.0, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.34918723660445516, |
|
"grad_norm": 0.7471884183215057, |
|
"learning_rate": 1.3019265502709213e-05, |
|
"loss": 0.6997, |
|
"mean_token_accuracy": 0.8057456374168396, |
|
"num_tokens": 151748101.0, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.35069235400361226, |
|
"grad_norm": 0.8204306141235812, |
|
"learning_rate": 1.298916315472607e-05, |
|
"loss": 0.7182, |
|
"mean_token_accuracy": 0.8028990581631661, |
|
"num_tokens": 152403461.0, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.3521974714027694, |
|
"grad_norm": 0.7380655579017532, |
|
"learning_rate": 1.2959060806742927e-05, |
|
"loss": 0.7054, |
|
"mean_token_accuracy": 0.8069384634494782, |
|
"num_tokens": 153055319.0, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.35370258880192657, |
|
"grad_norm": 0.8820967882531356, |
|
"learning_rate": 1.2928958458759784e-05, |
|
"loss": 0.7534, |
|
"mean_token_accuracy": 0.7969329059123993, |
|
"num_tokens": 153707035.0, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.35520770620108366, |
|
"grad_norm": 0.8315059717289699, |
|
"learning_rate": 1.289885611077664e-05, |
|
"loss": 0.7187, |
|
"mean_token_accuracy": 0.8047081142663955, |
|
"num_tokens": 154362395.0, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.3567128236002408, |
|
"grad_norm": 0.8123195898096921, |
|
"learning_rate": 1.28687537627935e-05, |
|
"loss": 0.7531, |
|
"mean_token_accuracy": 0.7959761649370194, |
|
"num_tokens": 155016494.0, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.35821794099939797, |
|
"grad_norm": 0.8707921062012124, |
|
"learning_rate": 1.2838651414810356e-05, |
|
"loss": 0.7309, |
|
"mean_token_accuracy": 0.8021174252033234, |
|
"num_tokens": 155671854.0, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.35972305839855506, |
|
"grad_norm": 0.8506282480491654, |
|
"learning_rate": 1.2808549066827213e-05, |
|
"loss": 0.7901, |
|
"mean_token_accuracy": 0.7879240825772286, |
|
"num_tokens": 156324314.0, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.3612281757977122, |
|
"grad_norm": 0.8426621488681493, |
|
"learning_rate": 1.277844671884407e-05, |
|
"loss": 0.7249, |
|
"mean_token_accuracy": 0.8012265384197235, |
|
"num_tokens": 156979374.0, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.36273329319686937, |
|
"grad_norm": 0.8133510365826947, |
|
"learning_rate": 1.2748344370860928e-05, |
|
"loss": 0.7323, |
|
"mean_token_accuracy": 0.8001759797334671, |
|
"num_tokens": 157632571.0, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.36423841059602646, |
|
"grad_norm": 0.915090508369899, |
|
"learning_rate": 1.2718242022877785e-05, |
|
"loss": 0.7696, |
|
"mean_token_accuracy": 0.7921423956751823, |
|
"num_tokens": 158287219.0, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.3657435279951836, |
|
"grad_norm": 0.8328754560870167, |
|
"learning_rate": 1.2688139674894642e-05, |
|
"loss": 0.7001, |
|
"mean_token_accuracy": 0.80736443400383, |
|
"num_tokens": 158942579.0, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.36724864539434077, |
|
"grad_norm": 0.7874069005602204, |
|
"learning_rate": 1.2658037326911501e-05, |
|
"loss": 0.7054, |
|
"mean_token_accuracy": 0.8079196363687515, |
|
"num_tokens": 159596538.0, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.3687537627934979, |
|
"grad_norm": 0.8371364466108957, |
|
"learning_rate": 1.2627934978928359e-05, |
|
"loss": 0.7674, |
|
"mean_token_accuracy": 0.793084391951561, |
|
"num_tokens": 160251898.0, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.370258880192655, |
|
"grad_norm": 0.8051959753781694, |
|
"learning_rate": 1.2597832630945214e-05, |
|
"loss": 0.7008, |
|
"mean_token_accuracy": 0.8092971444129944, |
|
"num_tokens": 160907258.0, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.37176399759181217, |
|
"grad_norm": 0.7887258704761677, |
|
"learning_rate": 1.2567730282962071e-05, |
|
"loss": 0.7584, |
|
"mean_token_accuracy": 0.7960580214858055, |
|
"num_tokens": 161562543.0, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.3732691149909693, |
|
"grad_norm": 0.8142522899450826, |
|
"learning_rate": 1.2537627934978929e-05, |
|
"loss": 0.7244, |
|
"mean_token_accuracy": 0.8027479246258735, |
|
"num_tokens": 162217903.0, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.3747742323901264, |
|
"grad_norm": 0.7897562813440631, |
|
"learning_rate": 1.2507525586995786e-05, |
|
"loss": 0.7025, |
|
"mean_token_accuracy": 0.8074427857995033, |
|
"num_tokens": 162871372.0, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.37627934978928357, |
|
"grad_norm": 0.852644589558439, |
|
"learning_rate": 1.2477423239012645e-05, |
|
"loss": 0.6673, |
|
"mean_token_accuracy": 0.816796886920929, |
|
"num_tokens": 163523919.0, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.3777844671884407, |
|
"grad_norm": 0.7204306408273152, |
|
"learning_rate": 1.2447320891029502e-05, |
|
"loss": 0.7047, |
|
"mean_token_accuracy": 0.806460677087307, |
|
"num_tokens": 164179279.0, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.3792895845875978, |
|
"grad_norm": 0.7560565919266367, |
|
"learning_rate": 1.241721854304636e-05, |
|
"loss": 0.7379, |
|
"mean_token_accuracy": 0.7992819055914879, |
|
"num_tokens": 164834053.0, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.38079470198675497, |
|
"grad_norm": 0.7735931141038619, |
|
"learning_rate": 1.2387116195063217e-05, |
|
"loss": 0.7063, |
|
"mean_token_accuracy": 0.8051950618624687, |
|
"num_tokens": 165486971.0, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.3822998193859121, |
|
"grad_norm": 0.841653983169311, |
|
"learning_rate": 1.2357013847080072e-05, |
|
"loss": 0.7876, |
|
"mean_token_accuracy": 0.7882312595844269, |
|
"num_tokens": 166142331.0, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.3838049367850692, |
|
"grad_norm": 0.7434911792385135, |
|
"learning_rate": 1.232691149909693e-05, |
|
"loss": 0.7465, |
|
"mean_token_accuracy": 0.7978531375527382, |
|
"num_tokens": 166797089.0, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.38531005418422637, |
|
"grad_norm": 0.7726472609806933, |
|
"learning_rate": 1.2296809151113788e-05, |
|
"loss": 0.7462, |
|
"mean_token_accuracy": 0.797703605890274, |
|
"num_tokens": 167451719.0, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.3868151715833835, |
|
"grad_norm": 0.9560931797627115, |
|
"learning_rate": 1.2266706803130646e-05, |
|
"loss": 0.7366, |
|
"mean_token_accuracy": 0.7992733284831047, |
|
"num_tokens": 168107079.0, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.3883202889825406, |
|
"grad_norm": 0.879404934750527, |
|
"learning_rate": 1.2236604455147503e-05, |
|
"loss": 0.7255, |
|
"mean_token_accuracy": 0.8031265258789062, |
|
"num_tokens": 168762439.0, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.38982540638169777, |
|
"grad_norm": 0.8202229177395133, |
|
"learning_rate": 1.220650210716436e-05, |
|
"loss": 0.7146, |
|
"mean_token_accuracy": 0.8043844655156136, |
|
"num_tokens": 169417799.0, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.3913305237808549, |
|
"grad_norm": 0.8098944443556049, |
|
"learning_rate": 1.2176399759181217e-05, |
|
"loss": 0.7035, |
|
"mean_token_accuracy": 0.8080252036452293, |
|
"num_tokens": 170070400.0, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.392835641180012, |
|
"grad_norm": 0.8047659285544517, |
|
"learning_rate": 1.2146297411198073e-05, |
|
"loss": 0.7066, |
|
"mean_token_accuracy": 0.8052042603492737, |
|
"num_tokens": 170725760.0, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.39434075857916917, |
|
"grad_norm": 0.8226203658827208, |
|
"learning_rate": 1.2116195063214933e-05, |
|
"loss": 0.7544, |
|
"mean_token_accuracy": 0.7957366958260537, |
|
"num_tokens": 171378113.0, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.3958458759783263, |
|
"grad_norm": 0.8099664607141047, |
|
"learning_rate": 1.2086092715231789e-05, |
|
"loss": 0.7307, |
|
"mean_token_accuracy": 0.802658586204052, |
|
"num_tokens": 172031920.0, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.3973509933774834, |
|
"grad_norm": 0.7010950454683768, |
|
"learning_rate": 1.2055990367248646e-05, |
|
"loss": 0.7235, |
|
"mean_token_accuracy": 0.8023072630167007, |
|
"num_tokens": 172687010.0, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.39885611077664057, |
|
"grad_norm": 0.7112213084350201, |
|
"learning_rate": 1.2025888019265504e-05, |
|
"loss": 0.7487, |
|
"mean_token_accuracy": 0.7985242143273353, |
|
"num_tokens": 173341760.0, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.4003612281757977, |
|
"grad_norm": 0.7866431133317365, |
|
"learning_rate": 1.199578567128236e-05, |
|
"loss": 0.7124, |
|
"mean_token_accuracy": 0.8047997072339058, |
|
"num_tokens": 173997120.0, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.4018663455749548, |
|
"grad_norm": 0.7958686340138259, |
|
"learning_rate": 1.1965683323299218e-05, |
|
"loss": 0.7264, |
|
"mean_token_accuracy": 0.8047920733690261, |
|
"num_tokens": 174652480.0, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.40337146297411197, |
|
"grad_norm": 0.7931937228548085, |
|
"learning_rate": 1.1935580975316077e-05, |
|
"loss": 0.7166, |
|
"mean_token_accuracy": 0.804453332722187, |
|
"num_tokens": 175307051.0, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.4048765803732691, |
|
"grad_norm": 0.8414047133010927, |
|
"learning_rate": 1.1905478627332934e-05, |
|
"loss": 0.722, |
|
"mean_token_accuracy": 0.803479178249836, |
|
"num_tokens": 175962411.0, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.4063816977724263, |
|
"grad_norm": 0.7700579867503355, |
|
"learning_rate": 1.1875376279349791e-05, |
|
"loss": 0.7377, |
|
"mean_token_accuracy": 0.7987645655870438, |
|
"num_tokens": 176615394.0, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.4078868151715834, |
|
"grad_norm": 0.8302573541607223, |
|
"learning_rate": 1.1845273931366647e-05, |
|
"loss": 0.7472, |
|
"mean_token_accuracy": 0.7968945801258087, |
|
"num_tokens": 177269102.0, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.4093919325707405, |
|
"grad_norm": 0.8765674423163995, |
|
"learning_rate": 1.1815171583383504e-05, |
|
"loss": 0.7945, |
|
"mean_token_accuracy": 0.7873334854841232, |
|
"num_tokens": 177922233.0, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.4108970499698977, |
|
"grad_norm": 0.8043233967618197, |
|
"learning_rate": 1.1785069235400361e-05, |
|
"loss": 0.686, |
|
"mean_token_accuracy": 0.8096742391586303, |
|
"num_tokens": 178575911.0, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.4124021673690548, |
|
"grad_norm": 0.8721399678669114, |
|
"learning_rate": 1.1754966887417219e-05, |
|
"loss": 0.717, |
|
"mean_token_accuracy": 0.8032816737890244, |
|
"num_tokens": 179229495.0, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.4139072847682119, |
|
"grad_norm": 0.8099202933603213, |
|
"learning_rate": 1.1724864539434078e-05, |
|
"loss": 0.6934, |
|
"mean_token_accuracy": 0.810499781370163, |
|
"num_tokens": 179883115.0, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.4154124021673691, |
|
"grad_norm": 0.7882042117138398, |
|
"learning_rate": 1.1694762191450935e-05, |
|
"loss": 0.7385, |
|
"mean_token_accuracy": 0.800557217001915, |
|
"num_tokens": 180538475.0, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.4169175195665262, |
|
"grad_norm": 0.8264936012665707, |
|
"learning_rate": 1.1664659843467792e-05, |
|
"loss": 0.7101, |
|
"mean_token_accuracy": 0.8055298551917076, |
|
"num_tokens": 181193709.0, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.4184226369656833, |
|
"grad_norm": 0.7635453349114087, |
|
"learning_rate": 1.1634557495484648e-05, |
|
"loss": 0.7189, |
|
"mean_token_accuracy": 0.8022925585508347, |
|
"num_tokens": 181848502.0, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.4199277543648405, |
|
"grad_norm": 0.8004600385192592, |
|
"learning_rate": 1.1604455147501505e-05, |
|
"loss": 0.7308, |
|
"mean_token_accuracy": 0.8010586395859718, |
|
"num_tokens": 182502476.0, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.4214328717639976, |
|
"grad_norm": 0.7193388253714986, |
|
"learning_rate": 1.1574352799518362e-05, |
|
"loss": 0.7066, |
|
"mean_token_accuracy": 0.8055385872721672, |
|
"num_tokens": 183157836.0, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.4229379891631547, |
|
"grad_norm": 0.9009713817079101, |
|
"learning_rate": 1.1544250451535221e-05, |
|
"loss": 0.7463, |
|
"mean_token_accuracy": 0.7980786472558975, |
|
"num_tokens": 183811497.0, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.4244431065623119, |
|
"grad_norm": 0.776187136186914, |
|
"learning_rate": 1.1514148103552078e-05, |
|
"loss": 0.6957, |
|
"mean_token_accuracy": 0.8083595156669616, |
|
"num_tokens": 184463896.0, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.425948223961469, |
|
"grad_norm": 0.8130038795159226, |
|
"learning_rate": 1.1484045755568936e-05, |
|
"loss": 0.7635, |
|
"mean_token_accuracy": 0.7954803004860878, |
|
"num_tokens": 185117476.0, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.4274533413606261, |
|
"grad_norm": 0.7564234601113903, |
|
"learning_rate": 1.1453943407585793e-05, |
|
"loss": 0.7116, |
|
"mean_token_accuracy": 0.8060019329190254, |
|
"num_tokens": 185771283.0, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.4289584587597833, |
|
"grad_norm": 0.906398593106907, |
|
"learning_rate": 1.142384105960265e-05, |
|
"loss": 0.7296, |
|
"mean_token_accuracy": 0.8021451219916343, |
|
"num_tokens": 186425549.0, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.4304635761589404, |
|
"grad_norm": 0.8200395489339062, |
|
"learning_rate": 1.1393738711619506e-05, |
|
"loss": 0.7595, |
|
"mean_token_accuracy": 0.7947599649429321, |
|
"num_tokens": 187078321.0, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.4319686935580975, |
|
"grad_norm": 0.8290871333202676, |
|
"learning_rate": 1.1363636363636366e-05, |
|
"loss": 0.7565, |
|
"mean_token_accuracy": 0.7961515039205551, |
|
"num_tokens": 187731135.0, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.4334738109572547, |
|
"grad_norm": 0.7911587392676075, |
|
"learning_rate": 1.1333534015653222e-05, |
|
"loss": 0.7449, |
|
"mean_token_accuracy": 0.7980863243341446, |
|
"num_tokens": 188384784.0, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.4349789283564118, |
|
"grad_norm": 0.711973469569701, |
|
"learning_rate": 1.1303431667670079e-05, |
|
"loss": 0.7251, |
|
"mean_token_accuracy": 0.8034043282270431, |
|
"num_tokens": 189039542.0, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.43648404575556893, |
|
"grad_norm": 0.8341450761931715, |
|
"learning_rate": 1.1273329319686936e-05, |
|
"loss": 0.7024, |
|
"mean_token_accuracy": 0.8091023206710816, |
|
"num_tokens": 189690067.0, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.4379891631547261, |
|
"grad_norm": 0.8160268015867358, |
|
"learning_rate": 1.1243226971703794e-05, |
|
"loss": 0.7087, |
|
"mean_token_accuracy": 0.8077363967895508, |
|
"num_tokens": 190344885.0, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.4394942805538832, |
|
"grad_norm": 0.835780708497786, |
|
"learning_rate": 1.1213124623720651e-05, |
|
"loss": 0.7091, |
|
"mean_token_accuracy": 0.8066539317369461, |
|
"num_tokens": 190997515.0, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.44099939795304033, |
|
"grad_norm": 0.7730508882826491, |
|
"learning_rate": 1.118302227573751e-05, |
|
"loss": 0.7519, |
|
"mean_token_accuracy": 0.7966817542910576, |
|
"num_tokens": 191652638.0, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.4425045153521975, |
|
"grad_norm": 0.7989264828087985, |
|
"learning_rate": 1.1152919927754367e-05, |
|
"loss": 0.7084, |
|
"mean_token_accuracy": 0.8050470232963562, |
|
"num_tokens": 192307998.0, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.44400963275135463, |
|
"grad_norm": 0.7475676253652657, |
|
"learning_rate": 1.1122817579771223e-05, |
|
"loss": 0.7064, |
|
"mean_token_accuracy": 0.804878756403923, |
|
"num_tokens": 192961266.0, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.44551475015051173, |
|
"grad_norm": 0.7886808863615066, |
|
"learning_rate": 1.109271523178808e-05, |
|
"loss": 0.717, |
|
"mean_token_accuracy": 0.8057780176401138, |
|
"num_tokens": 193615763.0, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.4470198675496689, |
|
"grad_norm": 0.8125274746428679, |
|
"learning_rate": 1.1062612883804937e-05, |
|
"loss": 0.7114, |
|
"mean_token_accuracy": 0.8053019717335701, |
|
"num_tokens": 194271123.0, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.44852498494882603, |
|
"grad_norm": 0.8260302921388075, |
|
"learning_rate": 1.1032510535821794e-05, |
|
"loss": 0.7267, |
|
"mean_token_accuracy": 0.8016424849629402, |
|
"num_tokens": 194925375.0, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.45003010234798313, |
|
"grad_norm": 0.8130548390682162, |
|
"learning_rate": 1.1002408187838652e-05, |
|
"loss": 0.7174, |
|
"mean_token_accuracy": 0.8041437566280365, |
|
"num_tokens": 195579677.0, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.4515352197471403, |
|
"grad_norm": 0.7150220204150168, |
|
"learning_rate": 1.097230583985551e-05, |
|
"loss": 0.7132, |
|
"mean_token_accuracy": 0.8052812933921814, |
|
"num_tokens": 196233104.0, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.45304033714629743, |
|
"grad_norm": 0.8186467059744715, |
|
"learning_rate": 1.0942203491872368e-05, |
|
"loss": 0.7536, |
|
"mean_token_accuracy": 0.796061310172081, |
|
"num_tokens": 196888464.0, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.45454545454545453, |
|
"grad_norm": 0.8030232699110134, |
|
"learning_rate": 1.0912101143889225e-05, |
|
"loss": 0.7156, |
|
"mean_token_accuracy": 0.8049913555383682, |
|
"num_tokens": 197542844.0, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.4560505719446117, |
|
"grad_norm": 0.7613073600567963, |
|
"learning_rate": 1.088199879590608e-05, |
|
"loss": 0.7041, |
|
"mean_token_accuracy": 0.8063686951994896, |
|
"num_tokens": 198197017.0, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.45755568934376883, |
|
"grad_norm": 0.8366714650181802, |
|
"learning_rate": 1.0851896447922938e-05, |
|
"loss": 0.7616, |
|
"mean_token_accuracy": 0.7939637005329132, |
|
"num_tokens": 198851028.0, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.45906080674292593, |
|
"grad_norm": 0.8638117040840707, |
|
"learning_rate": 1.0821794099939795e-05, |
|
"loss": 0.7173, |
|
"mean_token_accuracy": 0.8043005108833313, |
|
"num_tokens": 199504777.0, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.4605659241420831, |
|
"grad_norm": 0.8203104992903841, |
|
"learning_rate": 1.0791691751956654e-05, |
|
"loss": 0.7358, |
|
"mean_token_accuracy": 0.8005827903747559, |
|
"num_tokens": 200157604.0, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.46207104154124023, |
|
"grad_norm": 0.8144542650002136, |
|
"learning_rate": 1.0761589403973511e-05, |
|
"loss": 0.7416, |
|
"mean_token_accuracy": 0.7981523618102073, |
|
"num_tokens": 200809091.0, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.46357615894039733, |
|
"grad_norm": 0.7945409434106105, |
|
"learning_rate": 1.0731487055990369e-05, |
|
"loss": 0.6875, |
|
"mean_token_accuracy": 0.8094358786940574, |
|
"num_tokens": 201464171.0, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.4650812763395545, |
|
"grad_norm": 0.8730517194559562, |
|
"learning_rate": 1.0701384708007226e-05, |
|
"loss": 0.7183, |
|
"mean_token_accuracy": 0.8042961657047272, |
|
"num_tokens": 202119072.0, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.46658639373871164, |
|
"grad_norm": 0.8626403160499244, |
|
"learning_rate": 1.0671282360024083e-05, |
|
"loss": 0.7167, |
|
"mean_token_accuracy": 0.8044699609279633, |
|
"num_tokens": 202774432.0, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.46809151113786873, |
|
"grad_norm": 0.8523186755289315, |
|
"learning_rate": 1.0641180012040939e-05, |
|
"loss": 0.6908, |
|
"mean_token_accuracy": 0.8096819415688514, |
|
"num_tokens": 203427227.0, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.4695966285370259, |
|
"grad_norm": 0.8645519988255309, |
|
"learning_rate": 1.06110776640578e-05, |
|
"loss": 0.7216, |
|
"mean_token_accuracy": 0.8030843511223793, |
|
"num_tokens": 204081611.0, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.47110174593618304, |
|
"grad_norm": 0.7653421808114811, |
|
"learning_rate": 1.0580975316074655e-05, |
|
"loss": 0.6535, |
|
"mean_token_accuracy": 0.819001467525959, |
|
"num_tokens": 204736028.0, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.47260686333534013, |
|
"grad_norm": 0.7263321192874824, |
|
"learning_rate": 1.0550872968091512e-05, |
|
"loss": 0.7242, |
|
"mean_token_accuracy": 0.8027195662260056, |
|
"num_tokens": 205387916.0, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.4741119807344973, |
|
"grad_norm": 0.7601599258449856, |
|
"learning_rate": 1.052077062010837e-05, |
|
"loss": 0.7141, |
|
"mean_token_accuracy": 0.8052065283060074, |
|
"num_tokens": 206042288.0, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.47561709813365444, |
|
"grad_norm": 0.8000199112319787, |
|
"learning_rate": 1.0490668272125227e-05, |
|
"loss": 0.7296, |
|
"mean_token_accuracy": 0.8023619994521141, |
|
"num_tokens": 206696451.0, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.47712221553281153, |
|
"grad_norm": 0.8129807447364599, |
|
"learning_rate": 1.0460565924142084e-05, |
|
"loss": 0.7576, |
|
"mean_token_accuracy": 0.7973424136638642, |
|
"num_tokens": 207351750.0, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.4786273329319687, |
|
"grad_norm": 0.7658578926533476, |
|
"learning_rate": 1.0430463576158943e-05, |
|
"loss": 0.7211, |
|
"mean_token_accuracy": 0.8033246964216232, |
|
"num_tokens": 208005316.0, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.48013245033112584, |
|
"grad_norm": 0.8168158779578062, |
|
"learning_rate": 1.04003612281758e-05, |
|
"loss": 0.6937, |
|
"mean_token_accuracy": 0.8084788709878922, |
|
"num_tokens": 208660676.0, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.481637567730283, |
|
"grad_norm": 1.0052382018557156, |
|
"learning_rate": 1.0370258880192655e-05, |
|
"loss": 0.7248, |
|
"mean_token_accuracy": 0.8030228197574616, |
|
"num_tokens": 209312522.0, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.4831426851294401, |
|
"grad_norm": 0.7976089434726377, |
|
"learning_rate": 1.0340156532209513e-05, |
|
"loss": 0.6868, |
|
"mean_token_accuracy": 0.8124389365315438, |
|
"num_tokens": 209967882.0, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.48464780252859724, |
|
"grad_norm": 0.8229870659165623, |
|
"learning_rate": 1.031005418422637e-05, |
|
"loss": 0.7505, |
|
"mean_token_accuracy": 0.7961588889360428, |
|
"num_tokens": 210622270.0, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.4861529199277544, |
|
"grad_norm": 0.7747505536267031, |
|
"learning_rate": 1.0279951836243227e-05, |
|
"loss": 0.7026, |
|
"mean_token_accuracy": 0.807074373960495, |
|
"num_tokens": 211277630.0, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.4876580373269115, |
|
"grad_norm": 0.8871086206356764, |
|
"learning_rate": 1.0249849488260084e-05, |
|
"loss": 0.6974, |
|
"mean_token_accuracy": 0.8071722269058228, |
|
"num_tokens": 211930855.0, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.48916315472606864, |
|
"grad_norm": 0.8155925425450639, |
|
"learning_rate": 1.0219747140276943e-05, |
|
"loss": 0.7116, |
|
"mean_token_accuracy": 0.8057066261768341, |
|
"num_tokens": 212585259.0, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.4906682721252258, |
|
"grad_norm": 0.7849356966516002, |
|
"learning_rate": 1.01896447922938e-05, |
|
"loss": 0.7529, |
|
"mean_token_accuracy": 0.7978712409734726, |
|
"num_tokens": 213238834.0, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.4921733895243829, |
|
"grad_norm": 0.7736249961666276, |
|
"learning_rate": 1.0159542444310658e-05, |
|
"loss": 0.7183, |
|
"mean_token_accuracy": 0.8035428315401077, |
|
"num_tokens": 213892454.0, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.49367850692354004, |
|
"grad_norm": 0.8871576564844669, |
|
"learning_rate": 1.0129440096327513e-05, |
|
"loss": 0.7175, |
|
"mean_token_accuracy": 0.8037295624613762, |
|
"num_tokens": 214546735.0, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.4951836243226972, |
|
"grad_norm": 0.7820627438145885, |
|
"learning_rate": 1.009933774834437e-05, |
|
"loss": 0.704, |
|
"mean_token_accuracy": 0.806598387658596, |
|
"num_tokens": 215199700.0, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.4966887417218543, |
|
"grad_norm": 0.8144225434992196, |
|
"learning_rate": 1.0069235400361228e-05, |
|
"loss": 0.7196, |
|
"mean_token_accuracy": 0.8028386145830154, |
|
"num_tokens": 215854392.0, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.49819385912101144, |
|
"grad_norm": 0.8302919269226839, |
|
"learning_rate": 1.0039133052378087e-05, |
|
"loss": 0.6989, |
|
"mean_token_accuracy": 0.8083430036902428, |
|
"num_tokens": 216509752.0, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.4996989765201686, |
|
"grad_norm": 0.859098671961538, |
|
"learning_rate": 1.0009030704394944e-05, |
|
"loss": 0.6834, |
|
"mean_token_accuracy": 0.8109419390559196, |
|
"num_tokens": 217162720.0, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.5012040939193257, |
|
"grad_norm": 0.9124368923411915, |
|
"learning_rate": 9.978928356411801e-06, |
|
"loss": 0.753, |
|
"mean_token_accuracy": 0.7990192532539367, |
|
"num_tokens": 217816899.0, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.5027092113184829, |
|
"grad_norm": 0.876541714838967, |
|
"learning_rate": 9.948826008428659e-06, |
|
"loss": 0.6949, |
|
"mean_token_accuracy": 0.8079330369830131, |
|
"num_tokens": 218469392.0, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.5042143287176399, |
|
"grad_norm": 0.8237912305252111, |
|
"learning_rate": 9.918723660445514e-06, |
|
"loss": 0.7008, |
|
"mean_token_accuracy": 0.8087765663862229, |
|
"num_tokens": 219124752.0, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.5057194461167971, |
|
"grad_norm": 0.8805985124073732, |
|
"learning_rate": 9.888621312462373e-06, |
|
"loss": 0.737, |
|
"mean_token_accuracy": 0.7990552827715873, |
|
"num_tokens": 219778166.0, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.5072245635159542, |
|
"grad_norm": 0.8021126229570424, |
|
"learning_rate": 9.85851896447923e-06, |
|
"loss": 0.6971, |
|
"mean_token_accuracy": 0.8074719130992889, |
|
"num_tokens": 220431930.0, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.5087296809151114, |
|
"grad_norm": 0.7972712654677118, |
|
"learning_rate": 9.828416616496088e-06, |
|
"loss": 0.6956, |
|
"mean_token_accuracy": 0.8087294608354568, |
|
"num_tokens": 221085830.0, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.5102347983142685, |
|
"grad_norm": 0.8594868396631411, |
|
"learning_rate": 9.798314268512945e-06, |
|
"loss": 0.6783, |
|
"mean_token_accuracy": 0.8133060529828071, |
|
"num_tokens": 221741190.0, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.5117399157134257, |
|
"grad_norm": 0.8195675779575067, |
|
"learning_rate": 9.768211920529802e-06, |
|
"loss": 0.7259, |
|
"mean_token_accuracy": 0.8023128375411034, |
|
"num_tokens": 222396550.0, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.5132450331125827, |
|
"grad_norm": 0.8239469426553997, |
|
"learning_rate": 9.73810957254666e-06, |
|
"loss": 0.6731, |
|
"mean_token_accuracy": 0.8141262501478195, |
|
"num_tokens": 223050376.0, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.5147501505117399, |
|
"grad_norm": 0.7983847311346082, |
|
"learning_rate": 9.708007224563517e-06, |
|
"loss": 0.7026, |
|
"mean_token_accuracy": 0.8072569638490676, |
|
"num_tokens": 223704579.0, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.516255267910897, |
|
"grad_norm": 0.7960686997526522, |
|
"learning_rate": 9.677904876580374e-06, |
|
"loss": 0.7221, |
|
"mean_token_accuracy": 0.8030448406934738, |
|
"num_tokens": 224359850.0, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.5177603853100542, |
|
"grad_norm": 0.8914834941832228, |
|
"learning_rate": 9.647802528597231e-06, |
|
"loss": 0.7401, |
|
"mean_token_accuracy": 0.7997129946947098, |
|
"num_tokens": 225015210.0, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.5192655027092113, |
|
"grad_norm": 0.8819413244629241, |
|
"learning_rate": 9.617700180614088e-06, |
|
"loss": 0.7101, |
|
"mean_token_accuracy": 0.806117182970047, |
|
"num_tokens": 225670570.0, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.5207706201083685, |
|
"grad_norm": 0.7500759430414385, |
|
"learning_rate": 9.587597832630946e-06, |
|
"loss": 0.7091, |
|
"mean_token_accuracy": 0.8057623341679573, |
|
"num_tokens": 226324513.0, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.5222757375075255, |
|
"grad_norm": 0.8674975027620144, |
|
"learning_rate": 9.557495484647803e-06, |
|
"loss": 0.7347, |
|
"mean_token_accuracy": 0.8012763857841492, |
|
"num_tokens": 226979111.0, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.5237808549066827, |
|
"grad_norm": 0.7763764417344521, |
|
"learning_rate": 9.527393136664662e-06, |
|
"loss": 0.6999, |
|
"mean_token_accuracy": 0.8071465358138085, |
|
"num_tokens": 227632704.0, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.5252859723058398, |
|
"grad_norm": 0.7645444118571866, |
|
"learning_rate": 9.497290788681517e-06, |
|
"loss": 0.7076, |
|
"mean_token_accuracy": 0.8069016903638839, |
|
"num_tokens": 228286371.0, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.526791089704997, |
|
"grad_norm": 0.7792334177211975, |
|
"learning_rate": 9.467188440698375e-06, |
|
"loss": 0.696, |
|
"mean_token_accuracy": 0.8077277734875679, |
|
"num_tokens": 228941731.0, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.5282962071041541, |
|
"grad_norm": 0.7424501126051348, |
|
"learning_rate": 9.437086092715234e-06, |
|
"loss": 0.6985, |
|
"mean_token_accuracy": 0.8082936689257622, |
|
"num_tokens": 229595789.0, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.5298013245033113, |
|
"grad_norm": 0.7130346276764096, |
|
"learning_rate": 9.40698374473209e-06, |
|
"loss": 0.7345, |
|
"mean_token_accuracy": 0.8005663812160492, |
|
"num_tokens": 230251149.0, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.5313064419024683, |
|
"grad_norm": 0.8376943633378543, |
|
"learning_rate": 9.376881396748946e-06, |
|
"loss": 0.7263, |
|
"mean_token_accuracy": 0.8031929656863213, |
|
"num_tokens": 230902981.0, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.5328115593016255, |
|
"grad_norm": 0.7293887942764449, |
|
"learning_rate": 9.346779048765805e-06, |
|
"loss": 0.7432, |
|
"mean_token_accuracy": 0.7978022322058678, |
|
"num_tokens": 231557254.0, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.5343166767007826, |
|
"grad_norm": 0.7222945165317631, |
|
"learning_rate": 9.316676700782663e-06, |
|
"loss": 0.7092, |
|
"mean_token_accuracy": 0.8053798228502274, |
|
"num_tokens": 232212614.0, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.5358217940999398, |
|
"grad_norm": 0.8062132119932597, |
|
"learning_rate": 9.286574352799518e-06, |
|
"loss": 0.6995, |
|
"mean_token_accuracy": 0.8085808470845223, |
|
"num_tokens": 232866874.0, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.537326911499097, |
|
"grad_norm": 0.8382257521288697, |
|
"learning_rate": 9.256472004816377e-06, |
|
"loss": 0.716, |
|
"mean_token_accuracy": 0.805452823638916, |
|
"num_tokens": 233522182.0, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.5388320288982541, |
|
"grad_norm": 0.9108274302981122, |
|
"learning_rate": 9.226369656833234e-06, |
|
"loss": 0.7359, |
|
"mean_token_accuracy": 0.799033597111702, |
|
"num_tokens": 234175081.0, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.5403371462974113, |
|
"grad_norm": 0.7577515749647386, |
|
"learning_rate": 9.196267308850092e-06, |
|
"loss": 0.7382, |
|
"mean_token_accuracy": 0.8003885015845299, |
|
"num_tokens": 234830286.0, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.5418422636965683, |
|
"grad_norm": 0.8170263948595179, |
|
"learning_rate": 9.166164960866947e-06, |
|
"loss": 0.7093, |
|
"mean_token_accuracy": 0.8067599460482597, |
|
"num_tokens": 235482479.0, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.5433473810957254, |
|
"grad_norm": 0.8089345629962644, |
|
"learning_rate": 9.136062612883806e-06, |
|
"loss": 0.7013, |
|
"mean_token_accuracy": 0.8066691935062409, |
|
"num_tokens": 236134271.0, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.5448524984948826, |
|
"grad_norm": 0.8096489243491711, |
|
"learning_rate": 9.105960264900663e-06, |
|
"loss": 0.671, |
|
"mean_token_accuracy": 0.8146300122141839, |
|
"num_tokens": 236788343.0, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.5463576158940397, |
|
"grad_norm": 0.7712590758344209, |
|
"learning_rate": 9.07585791691752e-06, |
|
"loss": 0.6936, |
|
"mean_token_accuracy": 0.8090818926692009, |
|
"num_tokens": 237443703.0, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.5478627332931969, |
|
"grad_norm": 0.7331289652614967, |
|
"learning_rate": 9.045755568934378e-06, |
|
"loss": 0.6879, |
|
"mean_token_accuracy": 0.8104536637663842, |
|
"num_tokens": 238097543.0, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.549367850692354, |
|
"grad_norm": 0.7894029957389098, |
|
"learning_rate": 9.015653220951235e-06, |
|
"loss": 0.746, |
|
"mean_token_accuracy": 0.7971894860267639, |
|
"num_tokens": 238752903.0, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.5508729680915111, |
|
"grad_norm": 0.7337667549435172, |
|
"learning_rate": 8.985550872968092e-06, |
|
"loss": 0.6906, |
|
"mean_token_accuracy": 0.8105040520429612, |
|
"num_tokens": 239407909.0, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.5523780854906682, |
|
"grad_norm": 0.7114551170726308, |
|
"learning_rate": 8.95544852498495e-06, |
|
"loss": 0.7012, |
|
"mean_token_accuracy": 0.8078293934464454, |
|
"num_tokens": 240062694.0, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.5538832028898254, |
|
"grad_norm": 0.8053101136590101, |
|
"learning_rate": 8.925346177001807e-06, |
|
"loss": 0.7128, |
|
"mean_token_accuracy": 0.8046775788068772, |
|
"num_tokens": 240718054.0, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.5553883202889826, |
|
"grad_norm": 0.7743261506938821, |
|
"learning_rate": 8.895243829018664e-06, |
|
"loss": 0.6771, |
|
"mean_token_accuracy": 0.8148664027452469, |
|
"num_tokens": 241372573.0, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.5568934376881397, |
|
"grad_norm": 0.7698400139152973, |
|
"learning_rate": 8.865141481035521e-06, |
|
"loss": 0.7031, |
|
"mean_token_accuracy": 0.8072331473231316, |
|
"num_tokens": 242027933.0, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.5583985550872969, |
|
"grad_norm": 0.8364424915545474, |
|
"learning_rate": 8.835039133052378e-06, |
|
"loss": 0.7197, |
|
"mean_token_accuracy": 0.8043941155076026, |
|
"num_tokens": 242682455.0, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.5599036724864539, |
|
"grad_norm": 0.7810243244440305, |
|
"learning_rate": 8.804936785069236e-06, |
|
"loss": 0.7323, |
|
"mean_token_accuracy": 0.800259268283844, |
|
"num_tokens": 243337005.0, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.561408789885611, |
|
"grad_norm": 0.7361666809162144, |
|
"learning_rate": 8.774834437086095e-06, |
|
"loss": 0.6941, |
|
"mean_token_accuracy": 0.8084026902914048, |
|
"num_tokens": 243990915.0, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.5629139072847682, |
|
"grad_norm": 0.7513727479514392, |
|
"learning_rate": 8.74473208910295e-06, |
|
"loss": 0.6985, |
|
"mean_token_accuracy": 0.8086724117398262, |
|
"num_tokens": 244645790.0, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.5644190246839254, |
|
"grad_norm": 0.8119539552530328, |
|
"learning_rate": 8.714629741119807e-06, |
|
"loss": 0.7099, |
|
"mean_token_accuracy": 0.806215389072895, |
|
"num_tokens": 245300587.0, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.5659241420830825, |
|
"grad_norm": 0.8230131090315613, |
|
"learning_rate": 8.684527393136666e-06, |
|
"loss": 0.6961, |
|
"mean_token_accuracy": 0.807398022711277, |
|
"num_tokens": 245955947.0, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.5674292594822397, |
|
"grad_norm": 0.8657958087347106, |
|
"learning_rate": 8.654425045153522e-06, |
|
"loss": 0.6862, |
|
"mean_token_accuracy": 0.8127268105745316, |
|
"num_tokens": 246608889.0, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.5689343768813967, |
|
"grad_norm": 0.8509981229148196, |
|
"learning_rate": 8.62432269717038e-06, |
|
"loss": 0.7104, |
|
"mean_token_accuracy": 0.8052561670541764, |
|
"num_tokens": 247264249.0, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.5704394942805538, |
|
"grad_norm": 0.7620198197373849, |
|
"learning_rate": 8.594220349187238e-06, |
|
"loss": 0.7051, |
|
"mean_token_accuracy": 0.8070444941520691, |
|
"num_tokens": 247918973.0, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.571944611679711, |
|
"grad_norm": 0.7589070350681284, |
|
"learning_rate": 8.564118001204095e-06, |
|
"loss": 0.7077, |
|
"mean_token_accuracy": 0.8067975386977195, |
|
"num_tokens": 248573625.0, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.5734497290788682, |
|
"grad_norm": 0.8182641750295068, |
|
"learning_rate": 8.534015653220951e-06, |
|
"loss": 0.6953, |
|
"mean_token_accuracy": 0.8103018119931221, |
|
"num_tokens": 249226405.0, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.5749548464780253, |
|
"grad_norm": 0.8830083523333203, |
|
"learning_rate": 8.50391330523781e-06, |
|
"loss": 0.7299, |
|
"mean_token_accuracy": 0.8012143760919571, |
|
"num_tokens": 249880176.0, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.5764599638771825, |
|
"grad_norm": 0.8146412671634994, |
|
"learning_rate": 8.473810957254667e-06, |
|
"loss": 0.6632, |
|
"mean_token_accuracy": 0.8136916980147362, |
|
"num_tokens": 250532586.0, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.5779650812763396, |
|
"grad_norm": 0.8528906066399552, |
|
"learning_rate": 8.443708609271524e-06, |
|
"loss": 0.6806, |
|
"mean_token_accuracy": 0.8144933164119721, |
|
"num_tokens": 251186836.0, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.5794701986754967, |
|
"grad_norm": 0.8330221415506783, |
|
"learning_rate": 8.41360626128838e-06, |
|
"loss": 0.7665, |
|
"mean_token_accuracy": 0.7930653065443038, |
|
"num_tokens": 251840226.0, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.5809753160746538, |
|
"grad_norm": 0.8745918315876232, |
|
"learning_rate": 8.383503913305239e-06, |
|
"loss": 0.709, |
|
"mean_token_accuracy": 0.8056306362152099, |
|
"num_tokens": 252494390.0, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.582480433473811, |
|
"grad_norm": 0.9080680717665254, |
|
"learning_rate": 8.353401565322096e-06, |
|
"loss": 0.7128, |
|
"mean_token_accuracy": 0.8059083595871925, |
|
"num_tokens": 253147168.0, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.5839855508729681, |
|
"grad_norm": 0.9177323805657555, |
|
"learning_rate": 8.323299217338953e-06, |
|
"loss": 0.7202, |
|
"mean_token_accuracy": 0.8017226129770278, |
|
"num_tokens": 253801671.0, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.5854906682721253, |
|
"grad_norm": 0.7990780628042131, |
|
"learning_rate": 8.29319686935581e-06, |
|
"loss": 0.6931, |
|
"mean_token_accuracy": 0.8101041629910469, |
|
"num_tokens": 254455490.0, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.5869957856712824, |
|
"grad_norm": 0.8161176055240229, |
|
"learning_rate": 8.263094521372668e-06, |
|
"loss": 0.7182, |
|
"mean_token_accuracy": 0.8037081718444824, |
|
"num_tokens": 255110850.0, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.5885009030704395, |
|
"grad_norm": 0.7603871762926178, |
|
"learning_rate": 8.232992173389525e-06, |
|
"loss": 0.7476, |
|
"mean_token_accuracy": 0.7982825547456741, |
|
"num_tokens": 255764719.0, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.5900060204695966, |
|
"grad_norm": 0.7820111562993072, |
|
"learning_rate": 8.202889825406382e-06, |
|
"loss": 0.7109, |
|
"mean_token_accuracy": 0.806066806614399, |
|
"num_tokens": 256420079.0, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.5915111378687538, |
|
"grad_norm": 0.7059140890336615, |
|
"learning_rate": 8.17278747742324e-06, |
|
"loss": 0.7196, |
|
"mean_token_accuracy": 0.8032822415232659, |
|
"num_tokens": 257075439.0, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.5930162552679109, |
|
"grad_norm": 0.7949546362150071, |
|
"learning_rate": 8.142685129440097e-06, |
|
"loss": 0.7478, |
|
"mean_token_accuracy": 0.797232711315155, |
|
"num_tokens": 257727872.0, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.5945213726670681, |
|
"grad_norm": 0.7485881361967065, |
|
"learning_rate": 8.112582781456954e-06, |
|
"loss": 0.687, |
|
"mean_token_accuracy": 0.8098583161830902, |
|
"num_tokens": 258381731.0, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.5960264900662252, |
|
"grad_norm": 0.7946141713531891, |
|
"learning_rate": 8.082480433473811e-06, |
|
"loss": 0.7007, |
|
"mean_token_accuracy": 0.8085067048668861, |
|
"num_tokens": 259035946.0, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.5975316074653823, |
|
"grad_norm": 0.8230729638127956, |
|
"learning_rate": 8.052378085490669e-06, |
|
"loss": 0.7079, |
|
"mean_token_accuracy": 0.8059585183858872, |
|
"num_tokens": 259689357.0, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.5990367248645394, |
|
"grad_norm": 0.8325239572087008, |
|
"learning_rate": 8.022275737507526e-06, |
|
"loss": 0.7097, |
|
"mean_token_accuracy": 0.8045219138264657, |
|
"num_tokens": 260343427.0, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.6005418422636966, |
|
"grad_norm": 0.8434092525407488, |
|
"learning_rate": 7.992173389524383e-06, |
|
"loss": 0.7327, |
|
"mean_token_accuracy": 0.8014701396226883, |
|
"num_tokens": 260998787.0, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.6020469596628537, |
|
"grad_norm": 0.7986843262365315, |
|
"learning_rate": 7.96207104154124e-06, |
|
"loss": 0.6969, |
|
"mean_token_accuracy": 0.8094117864966393, |
|
"num_tokens": 261652843.0, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.6035520770620109, |
|
"grad_norm": 0.942507818939871, |
|
"learning_rate": 7.9319686935581e-06, |
|
"loss": 0.7051, |
|
"mean_token_accuracy": 0.807870452105999, |
|
"num_tokens": 262307531.0, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.605057194461168, |
|
"grad_norm": 0.7691027289067521, |
|
"learning_rate": 7.901866345574955e-06, |
|
"loss": 0.6997, |
|
"mean_token_accuracy": 0.8096072554588318, |
|
"num_tokens": 262959593.0, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.606562311860325, |
|
"grad_norm": 0.7411382069273657, |
|
"learning_rate": 7.871763997591812e-06, |
|
"loss": 0.6795, |
|
"mean_token_accuracy": 0.8132504016160965, |
|
"num_tokens": 263613258.0, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.6080674292594822, |
|
"grad_norm": 0.7606334503663869, |
|
"learning_rate": 7.841661649608671e-06, |
|
"loss": 0.71, |
|
"mean_token_accuracy": 0.8049027249217033, |
|
"num_tokens": 264267363.0, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.6095725466586394, |
|
"grad_norm": 0.740979735281585, |
|
"learning_rate": 7.811559301625528e-06, |
|
"loss": 0.6776, |
|
"mean_token_accuracy": 0.811868742108345, |
|
"num_tokens": 264919791.0, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.6110776640577965, |
|
"grad_norm": 0.7287167342458135, |
|
"learning_rate": 7.781456953642384e-06, |
|
"loss": 0.6714, |
|
"mean_token_accuracy": 0.8140159368515014, |
|
"num_tokens": 265575151.0, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.6125827814569537, |
|
"grad_norm": 0.769291891701047, |
|
"learning_rate": 7.751354605659243e-06, |
|
"loss": 0.6741, |
|
"mean_token_accuracy": 0.8136517152190208, |
|
"num_tokens": 266229797.0, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.6140878988561108, |
|
"grad_norm": 0.8750720681891121, |
|
"learning_rate": 7.7212522576761e-06, |
|
"loss": 0.662, |
|
"mean_token_accuracy": 0.8171534687280655, |
|
"num_tokens": 266877923.0, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.615593016255268, |
|
"grad_norm": 0.7883482239875798, |
|
"learning_rate": 7.691149909692957e-06, |
|
"loss": 0.6787, |
|
"mean_token_accuracy": 0.8132388830184937, |
|
"num_tokens": 267533283.0, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.617098133654425, |
|
"grad_norm": 0.8284914998217778, |
|
"learning_rate": 7.661047561709813e-06, |
|
"loss": 0.7326, |
|
"mean_token_accuracy": 0.8027213707566261, |
|
"num_tokens": 268187229.0, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.6186032510535822, |
|
"grad_norm": 0.7764151172700101, |
|
"learning_rate": 7.630945213726672e-06, |
|
"loss": 0.7139, |
|
"mean_token_accuracy": 0.8039753317832947, |
|
"num_tokens": 268842589.0, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.6201083684527393, |
|
"grad_norm": 0.8181806298737985, |
|
"learning_rate": 7.600842865743528e-06, |
|
"loss": 0.6847, |
|
"mean_token_accuracy": 0.810582558810711, |
|
"num_tokens": 269497949.0, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.6216134858518965, |
|
"grad_norm": 0.7990406941932666, |
|
"learning_rate": 7.570740517760385e-06, |
|
"loss": 0.6725, |
|
"mean_token_accuracy": 0.8125614732503891, |
|
"num_tokens": 270152948.0, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.6231186032510536, |
|
"grad_norm": 0.8679862562642979, |
|
"learning_rate": 7.5406381697772435e-06, |
|
"loss": 0.7086, |
|
"mean_token_accuracy": 0.806426303088665, |
|
"num_tokens": 270808241.0, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.6246237206502108, |
|
"grad_norm": 0.9042351682289728, |
|
"learning_rate": 7.510535821794101e-06, |
|
"loss": 0.7317, |
|
"mean_token_accuracy": 0.8006155714392662, |
|
"num_tokens": 271461947.0, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.6261288380493678, |
|
"grad_norm": 0.8428199817462217, |
|
"learning_rate": 7.480433473810957e-06, |
|
"loss": 0.6875, |
|
"mean_token_accuracy": 0.8104069977998734, |
|
"num_tokens": 272117307.0, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.627633955448525, |
|
"grad_norm": 0.7636935493635175, |
|
"learning_rate": 7.450331125827815e-06, |
|
"loss": 0.711, |
|
"mean_token_accuracy": 0.8058418169617653, |
|
"num_tokens": 272771056.0, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.6291390728476821, |
|
"grad_norm": 0.7308292189453819, |
|
"learning_rate": 7.4202287778446725e-06, |
|
"loss": 0.6874, |
|
"mean_token_accuracy": 0.8109762862324714, |
|
"num_tokens": 273425542.0, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.6306441902468393, |
|
"grad_norm": 0.7974301033969676, |
|
"learning_rate": 7.39012642986153e-06, |
|
"loss": 0.7122, |
|
"mean_token_accuracy": 0.8060316890478134, |
|
"num_tokens": 274080902.0, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.6321493076459964, |
|
"grad_norm": 0.8281909503451766, |
|
"learning_rate": 7.360024081878388e-06, |
|
"loss": 0.6789, |
|
"mean_token_accuracy": 0.8130032166838645, |
|
"num_tokens": 274734247.0, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.6336544250451536, |
|
"grad_norm": 0.8469223277668484, |
|
"learning_rate": 7.329921733895244e-06, |
|
"loss": 0.7208, |
|
"mean_token_accuracy": 0.8034349054098129, |
|
"num_tokens": 275389607.0, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.6351595424443106, |
|
"grad_norm": 0.8659132521429633, |
|
"learning_rate": 7.2998193859121015e-06, |
|
"loss": 0.7191, |
|
"mean_token_accuracy": 0.8031769096851349, |
|
"num_tokens": 276044967.0, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.6366646598434678, |
|
"grad_norm": 0.8420814798005133, |
|
"learning_rate": 7.2697170379289596e-06, |
|
"loss": 0.7292, |
|
"mean_token_accuracy": 0.8008877292275429, |
|
"num_tokens": 276699487.0, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.6381697772426249, |
|
"grad_norm": 0.8518963463964698, |
|
"learning_rate": 7.239614689945817e-06, |
|
"loss": 0.7155, |
|
"mean_token_accuracy": 0.8045470133423805, |
|
"num_tokens": 277353875.0, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.6396748946417821, |
|
"grad_norm": 0.8352911877693019, |
|
"learning_rate": 7.209512341962673e-06, |
|
"loss": 0.668, |
|
"mean_token_accuracy": 0.8157885015010834, |
|
"num_tokens": 278007391.0, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.6411800120409392, |
|
"grad_norm": 0.7890199230141322, |
|
"learning_rate": 7.179409993979531e-06, |
|
"loss": 0.7163, |
|
"mean_token_accuracy": 0.8045294061303139, |
|
"num_tokens": 278660967.0, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.6426851294400964, |
|
"grad_norm": 0.7907907383344028, |
|
"learning_rate": 7.1493076459963886e-06, |
|
"loss": 0.6951, |
|
"mean_token_accuracy": 0.8099789813160896, |
|
"num_tokens": 279315319.0, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.6441902468392534, |
|
"grad_norm": 0.7752324901881567, |
|
"learning_rate": 7.119205298013245e-06, |
|
"loss": 0.6796, |
|
"mean_token_accuracy": 0.8136557549238205, |
|
"num_tokens": 279970002.0, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.6456953642384106, |
|
"grad_norm": 0.7904244583768335, |
|
"learning_rate": 7.089102950030103e-06, |
|
"loss": 0.6932, |
|
"mean_token_accuracy": 0.8082010626792908, |
|
"num_tokens": 280621981.0, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.6472004816375677, |
|
"grad_norm": 0.7571107948166566, |
|
"learning_rate": 7.05900060204696e-06, |
|
"loss": 0.6714, |
|
"mean_token_accuracy": 0.8147674828767777, |
|
"num_tokens": 281275796.0, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.6487055990367249, |
|
"grad_norm": 0.7519819004545625, |
|
"learning_rate": 7.0288982540638175e-06, |
|
"loss": 0.6843, |
|
"mean_token_accuracy": 0.8124752193689346, |
|
"num_tokens": 281930076.0, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.650210716435882, |
|
"grad_norm": 0.8136271130646697, |
|
"learning_rate": 6.998795906080676e-06, |
|
"loss": 0.6902, |
|
"mean_token_accuracy": 0.8109596386551857, |
|
"num_tokens": 282585436.0, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.6517158338350392, |
|
"grad_norm": 0.7542024433510576, |
|
"learning_rate": 6.968693558097532e-06, |
|
"loss": 0.6853, |
|
"mean_token_accuracy": 0.8105558544397354, |
|
"num_tokens": 283239814.0, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.6532209512341962, |
|
"grad_norm": 0.8232441623305137, |
|
"learning_rate": 6.938591210114389e-06, |
|
"loss": 0.6962, |
|
"mean_token_accuracy": 0.8088379830121994, |
|
"num_tokens": 283893844.0, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.6547260686333534, |
|
"grad_norm": 0.8848354735677997, |
|
"learning_rate": 6.9084888621312465e-06, |
|
"loss": 0.7614, |
|
"mean_token_accuracy": 0.7953187227249146, |
|
"num_tokens": 284548116.0, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.6562311860325105, |
|
"grad_norm": 0.7946687427901056, |
|
"learning_rate": 6.878386514148105e-06, |
|
"loss": 0.6849, |
|
"mean_token_accuracy": 0.809443698823452, |
|
"num_tokens": 285203476.0, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.6577363034316677, |
|
"grad_norm": 0.8589342176440701, |
|
"learning_rate": 6.848284166164961e-06, |
|
"loss": 0.7309, |
|
"mean_token_accuracy": 0.8002122029662132, |
|
"num_tokens": 285858836.0, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.6592414208308248, |
|
"grad_norm": 0.7808127559059683, |
|
"learning_rate": 6.818181818181818e-06, |
|
"loss": 0.6558, |
|
"mean_token_accuracy": 0.8172375440597535, |
|
"num_tokens": 286513731.0, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.660746538229982, |
|
"grad_norm": 0.7929713144978754, |
|
"learning_rate": 6.788079470198676e-06, |
|
"loss": 0.6725, |
|
"mean_token_accuracy": 0.8145379722118378, |
|
"num_tokens": 287166163.0, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.6622516556291391, |
|
"grad_norm": 0.7853169035829088, |
|
"learning_rate": 6.757977122215534e-06, |
|
"loss": 0.7032, |
|
"mean_token_accuracy": 0.8071265637874603, |
|
"num_tokens": 287819834.0, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.6637567730282962, |
|
"grad_norm": 0.7581464532781553, |
|
"learning_rate": 6.72787477423239e-06, |
|
"loss": 0.6989, |
|
"mean_token_accuracy": 0.8081767991185188, |
|
"num_tokens": 288474034.0, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.6652618904274533, |
|
"grad_norm": 0.815968078478177, |
|
"learning_rate": 6.697772426249248e-06, |
|
"loss": 0.6781, |
|
"mean_token_accuracy": 0.8142720222473144, |
|
"num_tokens": 289128588.0, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.6667670078266105, |
|
"grad_norm": 0.6938381862941432, |
|
"learning_rate": 6.667670078266105e-06, |
|
"loss": 0.6601, |
|
"mean_token_accuracy": 0.8173514276742935, |
|
"num_tokens": 289782786.0, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.6682721252257676, |
|
"grad_norm": 0.732123530660249, |
|
"learning_rate": 6.637567730282963e-06, |
|
"loss": 0.7348, |
|
"mean_token_accuracy": 0.8012821659445762, |
|
"num_tokens": 290437574.0, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.6697772426249248, |
|
"grad_norm": 0.8281427123488975, |
|
"learning_rate": 6.60746538229982e-06, |
|
"loss": 0.6719, |
|
"mean_token_accuracy": 0.813536812365055, |
|
"num_tokens": 291091267.0, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.6712823600240819, |
|
"grad_norm": 0.779706260867014, |
|
"learning_rate": 6.577363034316677e-06, |
|
"loss": 0.6524, |
|
"mean_token_accuracy": 0.8184206783771515, |
|
"num_tokens": 291745182.0, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.672787477423239, |
|
"grad_norm": 0.8908264453869105, |
|
"learning_rate": 6.547260686333534e-06, |
|
"loss": 0.7097, |
|
"mean_token_accuracy": 0.8062851145863533, |
|
"num_tokens": 292400542.0, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.6742925948223961, |
|
"grad_norm": 0.7475757803731065, |
|
"learning_rate": 6.5171583383503924e-06, |
|
"loss": 0.7158, |
|
"mean_token_accuracy": 0.8046679839491844, |
|
"num_tokens": 293053114.0, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.6757977122215533, |
|
"grad_norm": 0.7390432299118334, |
|
"learning_rate": 6.487055990367249e-06, |
|
"loss": 0.674, |
|
"mean_token_accuracy": 0.8137899979948997, |
|
"num_tokens": 293708474.0, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.6773028296207104, |
|
"grad_norm": 0.803312400596195, |
|
"learning_rate": 6.456953642384106e-06, |
|
"loss": 0.7021, |
|
"mean_token_accuracy": 0.8084595799446106, |
|
"num_tokens": 294361973.0, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.6788079470198676, |
|
"grad_norm": 0.7831651665240028, |
|
"learning_rate": 6.426851294400964e-06, |
|
"loss": 0.669, |
|
"mean_token_accuracy": 0.8149545326828956, |
|
"num_tokens": 295015304.0, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.6803130644190247, |
|
"grad_norm": 0.7698329146574737, |
|
"learning_rate": 6.3967489464178214e-06, |
|
"loss": 0.6704, |
|
"mean_token_accuracy": 0.8154127985239029, |
|
"num_tokens": 295670664.0, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.6818181818181818, |
|
"grad_norm": 0.7685548489665934, |
|
"learning_rate": 6.366646598434678e-06, |
|
"loss": 0.6691, |
|
"mean_token_accuracy": 0.8155076310038567, |
|
"num_tokens": 296324909.0, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.6833232992173389, |
|
"grad_norm": 0.953070332907545, |
|
"learning_rate": 6.336544250451536e-06, |
|
"loss": 0.7428, |
|
"mean_token_accuracy": 0.7991679951548576, |
|
"num_tokens": 296980269.0, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.6848284166164961, |
|
"grad_norm": 0.8190164713090095, |
|
"learning_rate": 6.306441902468393e-06, |
|
"loss": 0.678, |
|
"mean_token_accuracy": 0.8134281873703003, |
|
"num_tokens": 297635629.0, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.6863335340156532, |
|
"grad_norm": 0.7992478641110825, |
|
"learning_rate": 6.27633955448525e-06, |
|
"loss": 0.6584, |
|
"mean_token_accuracy": 0.8175561770796775, |
|
"num_tokens": 298290989.0, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.6878386514148104, |
|
"grad_norm": 0.755402637130093, |
|
"learning_rate": 6.2462372065021085e-06, |
|
"loss": 0.6935, |
|
"mean_token_accuracy": 0.8109642148017884, |
|
"num_tokens": 298946349.0, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.6893437688139675, |
|
"grad_norm": 0.8090465130369789, |
|
"learning_rate": 6.216134858518965e-06, |
|
"loss": 0.6756, |
|
"mean_token_accuracy": 0.8144601851701736, |
|
"num_tokens": 299601709.0, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.6908488862131246, |
|
"grad_norm": 0.7051718017563176, |
|
"learning_rate": 6.186032510535822e-06, |
|
"loss": 0.6944, |
|
"mean_token_accuracy": 0.8100238159298897, |
|
"num_tokens": 300256713.0, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.6923540036122817, |
|
"grad_norm": 0.8023442611978678, |
|
"learning_rate": 6.15593016255268e-06, |
|
"loss": 0.7102, |
|
"mean_token_accuracy": 0.8061564579606056, |
|
"num_tokens": 300910315.0, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.6938591210114389, |
|
"grad_norm": 0.6592248864741374, |
|
"learning_rate": 6.1258278145695375e-06, |
|
"loss": 0.6394, |
|
"mean_token_accuracy": 0.8232551902532578, |
|
"num_tokens": 301563354.0, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.695364238410596, |
|
"grad_norm": 0.8216711999766739, |
|
"learning_rate": 6.095725466586394e-06, |
|
"loss": 0.7033, |
|
"mean_token_accuracy": 0.8067058518528938, |
|
"num_tokens": 302218621.0, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.6968693558097532, |
|
"grad_norm": 0.7450543082358431, |
|
"learning_rate": 6.065623118603251e-06, |
|
"loss": 0.6414, |
|
"mean_token_accuracy": 0.8213235855102539, |
|
"num_tokens": 302872247.0, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.6983744732089103, |
|
"grad_norm": 0.7184327185317253, |
|
"learning_rate": 6.035520770620109e-06, |
|
"loss": 0.6882, |
|
"mean_token_accuracy": 0.8115840300917625, |
|
"num_tokens": 303527607.0, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.6998795906080675, |
|
"grad_norm": 0.6945338421799874, |
|
"learning_rate": 6.005418422636966e-06, |
|
"loss": 0.6713, |
|
"mean_token_accuracy": 0.81469986140728, |
|
"num_tokens": 304182967.0, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.7013847080072245, |
|
"grad_norm": 0.840530737970437, |
|
"learning_rate": 5.975316074653823e-06, |
|
"loss": 0.7247, |
|
"mean_token_accuracy": 0.802819675207138, |
|
"num_tokens": 304838327.0, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.7028898254063817, |
|
"grad_norm": 0.7290847678467367, |
|
"learning_rate": 5.945213726670681e-06, |
|
"loss": 0.6556, |
|
"mean_token_accuracy": 0.8193713694810867, |
|
"num_tokens": 305492953.0, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.7043949428055388, |
|
"grad_norm": 0.7877331459001613, |
|
"learning_rate": 5.915111378687538e-06, |
|
"loss": 0.6846, |
|
"mean_token_accuracy": 0.8122801646590233, |
|
"num_tokens": 306148313.0, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.705900060204696, |
|
"grad_norm": 0.7764906463226239, |
|
"learning_rate": 5.885009030704395e-06, |
|
"loss": 0.6865, |
|
"mean_token_accuracy": 0.810577142238617, |
|
"num_tokens": 306801862.0, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.7074051776038531, |
|
"grad_norm": 0.7484928423002017, |
|
"learning_rate": 5.854906682721253e-06, |
|
"loss": 0.6511, |
|
"mean_token_accuracy": 0.818084391951561, |
|
"num_tokens": 307457222.0, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.7089102950030103, |
|
"grad_norm": 0.8535454844390912, |
|
"learning_rate": 5.82480433473811e-06, |
|
"loss": 0.6695, |
|
"mean_token_accuracy": 0.814570102095604, |
|
"num_tokens": 308112582.0, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.7104154124021673, |
|
"grad_norm": 0.835114125014516, |
|
"learning_rate": 5.794701986754967e-06, |
|
"loss": 0.7017, |
|
"mean_token_accuracy": 0.8075006246566773, |
|
"num_tokens": 308765597.0, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.7119205298013245, |
|
"grad_norm": 0.8008764952393119, |
|
"learning_rate": 5.764599638771825e-06, |
|
"loss": 0.6593, |
|
"mean_token_accuracy": 0.8166110992431641, |
|
"num_tokens": 309420016.0, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.7134256472004816, |
|
"grad_norm": 0.7196592850104424, |
|
"learning_rate": 5.734497290788682e-06, |
|
"loss": 0.6586, |
|
"mean_token_accuracy": 0.8172907829284668, |
|
"num_tokens": 310073088.0, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.7149307645996388, |
|
"grad_norm": 0.7935183204776026, |
|
"learning_rate": 5.704394942805539e-06, |
|
"loss": 0.7041, |
|
"mean_token_accuracy": 0.8055432423949241, |
|
"num_tokens": 310727014.0, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.7164358819987959, |
|
"grad_norm": 0.7208961075633612, |
|
"learning_rate": 5.674292594822397e-06, |
|
"loss": 0.7052, |
|
"mean_token_accuracy": 0.8065466269850731, |
|
"num_tokens": 311382141.0, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.7179409993979531, |
|
"grad_norm": 0.8241431117457545, |
|
"learning_rate": 5.644190246839254e-06, |
|
"loss": 0.7042, |
|
"mean_token_accuracy": 0.8074371844530106, |
|
"num_tokens": 312034916.0, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.7194461167971101, |
|
"grad_norm": 0.7811410686185015, |
|
"learning_rate": 5.614087898856111e-06, |
|
"loss": 0.6767, |
|
"mean_token_accuracy": 0.8120527639985085, |
|
"num_tokens": 312689580.0, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.7209512341962673, |
|
"grad_norm": 0.7065153170908021, |
|
"learning_rate": 5.583985550872969e-06, |
|
"loss": 0.7057, |
|
"mean_token_accuracy": 0.8083510205149651, |
|
"num_tokens": 313342728.0, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.7224563515954244, |
|
"grad_norm": 0.8288220684385291, |
|
"learning_rate": 5.553883202889826e-06, |
|
"loss": 0.7298, |
|
"mean_token_accuracy": 0.8011138662695885, |
|
"num_tokens": 313995093.0, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.7239614689945816, |
|
"grad_norm": 0.7676652284446706, |
|
"learning_rate": 5.523780854906683e-06, |
|
"loss": 0.6931, |
|
"mean_token_accuracy": 0.811854538321495, |
|
"num_tokens": 314648881.0, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.7254665863937387, |
|
"grad_norm": 0.7885991111534025, |
|
"learning_rate": 5.493678506923541e-06, |
|
"loss": 0.6705, |
|
"mean_token_accuracy": 0.8141166970133782, |
|
"num_tokens": 315304241.0, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.7269717037928959, |
|
"grad_norm": 0.8161757673596582, |
|
"learning_rate": 5.463576158940398e-06, |
|
"loss": 0.6762, |
|
"mean_token_accuracy": 0.814816965162754, |
|
"num_tokens": 315958535.0, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.7284768211920529, |
|
"grad_norm": 0.7968418972928049, |
|
"learning_rate": 5.433473810957255e-06, |
|
"loss": 0.6881, |
|
"mean_token_accuracy": 0.8102676823735238, |
|
"num_tokens": 316613446.0, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.7299819385912101, |
|
"grad_norm": 0.8092373608084555, |
|
"learning_rate": 5.403371462974113e-06, |
|
"loss": 0.6976, |
|
"mean_token_accuracy": 0.8103517308831215, |
|
"num_tokens": 317268325.0, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.7314870559903672, |
|
"grad_norm": 0.8801000450815974, |
|
"learning_rate": 5.3732691149909695e-06, |
|
"loss": 0.6517, |
|
"mean_token_accuracy": 0.819042882323265, |
|
"num_tokens": 317921988.0, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.7329921733895244, |
|
"grad_norm": 0.8380758783209445, |
|
"learning_rate": 5.343166767007827e-06, |
|
"loss": 0.6978, |
|
"mean_token_accuracy": 0.8100692957639695, |
|
"num_tokens": 318575553.0, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.7344972907886815, |
|
"grad_norm": 0.7584475333652385, |
|
"learning_rate": 5.313064419024684e-06, |
|
"loss": 0.6857, |
|
"mean_token_accuracy": 0.8126893028616905, |
|
"num_tokens": 319230913.0, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.7360024081878387, |
|
"grad_norm": 0.7631070843965636, |
|
"learning_rate": 5.282962071041542e-06, |
|
"loss": 0.6765, |
|
"mean_token_accuracy": 0.8138569176197052, |
|
"num_tokens": 319883805.0, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.7375075255869958, |
|
"grad_norm": 0.718262307466775, |
|
"learning_rate": 5.2528597230583985e-06, |
|
"loss": 0.6454, |
|
"mean_token_accuracy": 0.820010906457901, |
|
"num_tokens": 320538065.0, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.7390126429861529, |
|
"grad_norm": 0.7493033725192411, |
|
"learning_rate": 5.222757375075256e-06, |
|
"loss": 0.6806, |
|
"mean_token_accuracy": 0.8114639312028885, |
|
"num_tokens": 321193353.0, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.74051776038531, |
|
"grad_norm": 0.7653187902059273, |
|
"learning_rate": 5.192655027092114e-06, |
|
"loss": 0.7073, |
|
"mean_token_accuracy": 0.8058588966727257, |
|
"num_tokens": 321847074.0, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.7420228777844672, |
|
"grad_norm": 0.8171689434630046, |
|
"learning_rate": 5.162552679108971e-06, |
|
"loss": 0.6806, |
|
"mean_token_accuracy": 0.8125156402587891, |
|
"num_tokens": 322498780.0, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.7435279951836243, |
|
"grad_norm": 0.8542271561789201, |
|
"learning_rate": 5.1324503311258275e-06, |
|
"loss": 0.7199, |
|
"mean_token_accuracy": 0.8045185759663582, |
|
"num_tokens": 323151125.0, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.7450331125827815, |
|
"grad_norm": 0.7132356469162935, |
|
"learning_rate": 5.102347983142686e-06, |
|
"loss": 0.6765, |
|
"mean_token_accuracy": 0.8129563733935357, |
|
"num_tokens": 323806449.0, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.7465382299819386, |
|
"grad_norm": 0.7779913945784402, |
|
"learning_rate": 5.072245635159543e-06, |
|
"loss": 0.675, |
|
"mean_token_accuracy": 0.8126250460743905, |
|
"num_tokens": 324460173.0, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.7480433473810957, |
|
"grad_norm": 0.7915503234573145, |
|
"learning_rate": 5.0421432871764e-06, |
|
"loss": 0.7458, |
|
"mean_token_accuracy": 0.7976020961999893, |
|
"num_tokens": 325113568.0, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.7495484647802528, |
|
"grad_norm": 0.7769387021454827, |
|
"learning_rate": 5.012040939193258e-06, |
|
"loss": 0.6686, |
|
"mean_token_accuracy": 0.8162689179182052, |
|
"num_tokens": 325767952.0, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.75105358217941, |
|
"grad_norm": 0.8144124018793544, |
|
"learning_rate": 4.981938591210115e-06, |
|
"loss": 0.7011, |
|
"mean_token_accuracy": 0.8070224747061729, |
|
"num_tokens": 326423312.0, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.7525586995785671, |
|
"grad_norm": 0.7720856727719556, |
|
"learning_rate": 4.951836243226973e-06, |
|
"loss": 0.677, |
|
"mean_token_accuracy": 0.8127442598342896, |
|
"num_tokens": 327078672.0, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.7540638169777243, |
|
"grad_norm": 0.8464803974508197, |
|
"learning_rate": 4.921733895243829e-06, |
|
"loss": 0.7111, |
|
"mean_token_accuracy": 0.8053447112441063, |
|
"num_tokens": 327734032.0, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.7555689343768814, |
|
"grad_norm": 0.8002088077987107, |
|
"learning_rate": 4.891631547260687e-06, |
|
"loss": 0.7123, |
|
"mean_token_accuracy": 0.8058378130197525, |
|
"num_tokens": 328389392.0, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.7570740517760385, |
|
"grad_norm": 0.7468644534166601, |
|
"learning_rate": 4.861529199277544e-06, |
|
"loss": 0.6742, |
|
"mean_token_accuracy": 0.8150235041975975, |
|
"num_tokens": 329044752.0, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 0.7585791691751956, |
|
"grad_norm": 0.8015537666197009, |
|
"learning_rate": 4.831426851294402e-06, |
|
"loss": 0.6862, |
|
"mean_token_accuracy": 0.811229458451271, |
|
"num_tokens": 329700020.0, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.7600842865743528, |
|
"grad_norm": 0.7838531070629737, |
|
"learning_rate": 4.801324503311259e-06, |
|
"loss": 0.6858, |
|
"mean_token_accuracy": 0.8106863215565682, |
|
"num_tokens": 330354156.0, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.7615894039735099, |
|
"grad_norm": 0.7591951467548631, |
|
"learning_rate": 4.771222155328115e-06, |
|
"loss": 0.7081, |
|
"mean_token_accuracy": 0.806295795738697, |
|
"num_tokens": 331009516.0, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.7630945213726671, |
|
"grad_norm": 0.7881977216420744, |
|
"learning_rate": 4.741119807344973e-06, |
|
"loss": 0.6688, |
|
"mean_token_accuracy": 0.8141529381275177, |
|
"num_tokens": 331664516.0, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.7645996387718242, |
|
"grad_norm": 0.8045258451588279, |
|
"learning_rate": 4.711017459361831e-06, |
|
"loss": 0.6753, |
|
"mean_token_accuracy": 0.8127610564231873, |
|
"num_tokens": 332319876.0, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.7661047561709813, |
|
"grad_norm": 0.7950453594601595, |
|
"learning_rate": 4.680915111378688e-06, |
|
"loss": 0.6449, |
|
"mean_token_accuracy": 0.8206124827265739, |
|
"num_tokens": 332975236.0, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 0.7676098735701384, |
|
"grad_norm": 0.83782481780607, |
|
"learning_rate": 4.650812763395545e-06, |
|
"loss": 0.6954, |
|
"mean_token_accuracy": 0.8100070223212242, |
|
"num_tokens": 333630596.0, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.7691149909692956, |
|
"grad_norm": 0.765461042500271, |
|
"learning_rate": 4.620710415412402e-06, |
|
"loss": 0.6878, |
|
"mean_token_accuracy": 0.8108726218342781, |
|
"num_tokens": 334285956.0, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 0.7706201083684527, |
|
"grad_norm": 0.8595361462765182, |
|
"learning_rate": 4.59060806742926e-06, |
|
"loss": 0.6743, |
|
"mean_token_accuracy": 0.8132791504263878, |
|
"num_tokens": 334940764.0, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.7721252257676099, |
|
"grad_norm": 0.7951576328256589, |
|
"learning_rate": 4.560505719446117e-06, |
|
"loss": 0.6788, |
|
"mean_token_accuracy": 0.8132970303297042, |
|
"num_tokens": 335592227.0, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 0.773630343166767, |
|
"grad_norm": 0.8314496227848391, |
|
"learning_rate": 4.530403371462975e-06, |
|
"loss": 0.6944, |
|
"mean_token_accuracy": 0.8085516512393951, |
|
"num_tokens": 336244385.0, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 0.7751354605659242, |
|
"grad_norm": 0.7849604988449984, |
|
"learning_rate": 4.500301023479831e-06, |
|
"loss": 0.6596, |
|
"mean_token_accuracy": 0.8182275414466857, |
|
"num_tokens": 336899083.0, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.7766405779650812, |
|
"grad_norm": 0.7905666003585319, |
|
"learning_rate": 4.4701986754966895e-06, |
|
"loss": 0.6735, |
|
"mean_token_accuracy": 0.8137266218662262, |
|
"num_tokens": 337553265.0, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.7781456953642384, |
|
"grad_norm": 0.795479794053426, |
|
"learning_rate": 4.440096327513547e-06, |
|
"loss": 0.6717, |
|
"mean_token_accuracy": 0.8137852057814599, |
|
"num_tokens": 338207675.0, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 0.7796508127633955, |
|
"grad_norm": 0.7289284088287575, |
|
"learning_rate": 4.409993979530404e-06, |
|
"loss": 0.652, |
|
"mean_token_accuracy": 0.8170636877417564, |
|
"num_tokens": 338861544.0, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 0.7811559301625527, |
|
"grad_norm": 0.7196535550987029, |
|
"learning_rate": 4.379891631547261e-06, |
|
"loss": 0.6521, |
|
"mean_token_accuracy": 0.8188467666506767, |
|
"num_tokens": 339516863.0, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 0.7826610475617098, |
|
"grad_norm": 0.7830400455502452, |
|
"learning_rate": 4.3497892835641185e-06, |
|
"loss": 0.7021, |
|
"mean_token_accuracy": 0.8078183531761169, |
|
"num_tokens": 340167656.0, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.784166164960867, |
|
"grad_norm": 0.7603191091855747, |
|
"learning_rate": 4.319686935580976e-06, |
|
"loss": 0.659, |
|
"mean_token_accuracy": 0.8158310890197754, |
|
"num_tokens": 340823016.0, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 0.785671282360024, |
|
"grad_norm": 0.7480631602536956, |
|
"learning_rate": 4.289584587597833e-06, |
|
"loss": 0.6682, |
|
"mean_token_accuracy": 0.8159776479005814, |
|
"num_tokens": 341478376.0, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 0.7871763997591812, |
|
"grad_norm": 0.7270903162865268, |
|
"learning_rate": 4.25948223961469e-06, |
|
"loss": 0.634, |
|
"mean_token_accuracy": 0.8241108819842339, |
|
"num_tokens": 342132186.0, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 0.7886815171583383, |
|
"grad_norm": 0.8122645136251377, |
|
"learning_rate": 4.2293798916315475e-06, |
|
"loss": 0.6751, |
|
"mean_token_accuracy": 0.8132022470235825, |
|
"num_tokens": 342787546.0, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 0.7901866345574955, |
|
"grad_norm": 0.7814481654035551, |
|
"learning_rate": 4.199277543648405e-06, |
|
"loss": 0.6769, |
|
"mean_token_accuracy": 0.8136052757501602, |
|
"num_tokens": 343442906.0, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.7916917519566526, |
|
"grad_norm": 0.7611291029949178, |
|
"learning_rate": 4.169175195665262e-06, |
|
"loss": 0.7009, |
|
"mean_token_accuracy": 0.8087088346481324, |
|
"num_tokens": 344096020.0, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 0.7931968693558098, |
|
"grad_norm": 0.7700160646970695, |
|
"learning_rate": 4.139072847682119e-06, |
|
"loss": 0.6854, |
|
"mean_token_accuracy": 0.8107923865318298, |
|
"num_tokens": 344749548.0, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 0.7947019867549668, |
|
"grad_norm": 0.7887413891550367, |
|
"learning_rate": 4.108970499698977e-06, |
|
"loss": 0.6733, |
|
"mean_token_accuracy": 0.8150668799877167, |
|
"num_tokens": 345402893.0, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.796207104154124, |
|
"grad_norm": 0.7829705708780775, |
|
"learning_rate": 4.078868151715834e-06, |
|
"loss": 0.6863, |
|
"mean_token_accuracy": 0.8105948135256767, |
|
"num_tokens": 346056278.0, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 0.7977122215532811, |
|
"grad_norm": 0.8031666958040956, |
|
"learning_rate": 4.048765803732692e-06, |
|
"loss": 0.649, |
|
"mean_token_accuracy": 0.8193706855177879, |
|
"num_tokens": 346709619.0, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.7992173389524383, |
|
"grad_norm": 0.802978454866161, |
|
"learning_rate": 4.018663455749548e-06, |
|
"loss": 0.6909, |
|
"mean_token_accuracy": 0.8110115423798561, |
|
"num_tokens": 347364979.0, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 0.8007224563515954, |
|
"grad_norm": 0.6952179924863313, |
|
"learning_rate": 3.988561107766406e-06, |
|
"loss": 0.6595, |
|
"mean_token_accuracy": 0.8168508380651474, |
|
"num_tokens": 348019759.0, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 0.8022275737507526, |
|
"grad_norm": 0.7738986823200219, |
|
"learning_rate": 3.9584587597832635e-06, |
|
"loss": 0.6847, |
|
"mean_token_accuracy": 0.8109239622950554, |
|
"num_tokens": 348673508.0, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 0.8037326911499096, |
|
"grad_norm": 0.7412260862715365, |
|
"learning_rate": 3.928356411800121e-06, |
|
"loss": 0.695, |
|
"mean_token_accuracy": 0.8071655169129371, |
|
"num_tokens": 349327321.0, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 0.8052378085490668, |
|
"grad_norm": 0.8010096276041234, |
|
"learning_rate": 3.898254063816978e-06, |
|
"loss": 0.6978, |
|
"mean_token_accuracy": 0.8087977394461632, |
|
"num_tokens": 349982286.0, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.8067429259482239, |
|
"grad_norm": 0.8731170774854645, |
|
"learning_rate": 3.868151715833835e-06, |
|
"loss": 0.7326, |
|
"mean_token_accuracy": 0.8022829025983811, |
|
"num_tokens": 350634416.0, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 0.8082480433473811, |
|
"grad_norm": 0.7093730462307066, |
|
"learning_rate": 3.8380493678506925e-06, |
|
"loss": 0.6596, |
|
"mean_token_accuracy": 0.8181972727179527, |
|
"num_tokens": 351287751.0, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 0.8097531607465382, |
|
"grad_norm": 0.8137793008351559, |
|
"learning_rate": 3.80794701986755e-06, |
|
"loss": 0.6859, |
|
"mean_token_accuracy": 0.8102050065994263, |
|
"num_tokens": 351940648.0, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 0.8112582781456954, |
|
"grad_norm": 0.7904273712123535, |
|
"learning_rate": 3.7778446718844074e-06, |
|
"loss": 0.6795, |
|
"mean_token_accuracy": 0.8127697423100472, |
|
"num_tokens": 352594701.0, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 0.8127633955448526, |
|
"grad_norm": 0.8285781285803187, |
|
"learning_rate": 3.7477423239012643e-06, |
|
"loss": 0.671, |
|
"mean_token_accuracy": 0.8142693549394607, |
|
"num_tokens": 353250061.0, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.8142685129440096, |
|
"grad_norm": 0.8011770165014716, |
|
"learning_rate": 3.717639975918122e-06, |
|
"loss": 0.7228, |
|
"mean_token_accuracy": 0.8017405867576599, |
|
"num_tokens": 353905112.0, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 0.8157736303431667, |
|
"grad_norm": 0.8672249780085222, |
|
"learning_rate": 3.6875376279349796e-06, |
|
"loss": 0.6968, |
|
"mean_token_accuracy": 0.8080266386270523, |
|
"num_tokens": 354557456.0, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 0.8172787477423239, |
|
"grad_norm": 0.8120117834431909, |
|
"learning_rate": 3.6574352799518364e-06, |
|
"loss": 0.6925, |
|
"mean_token_accuracy": 0.8095671758055687, |
|
"num_tokens": 355210797.0, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 0.818783865141481, |
|
"grad_norm": 0.863727834809856, |
|
"learning_rate": 3.627332931968694e-06, |
|
"loss": 0.7268, |
|
"mean_token_accuracy": 0.8012472525238991, |
|
"num_tokens": 355866157.0, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 0.8202889825406382, |
|
"grad_norm": 0.781104146631108, |
|
"learning_rate": 3.597230583985551e-06, |
|
"loss": 0.6399, |
|
"mean_token_accuracy": 0.8211751446127892, |
|
"num_tokens": 356519491.0, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.8217940999397954, |
|
"grad_norm": 0.79463590801861, |
|
"learning_rate": 3.5671282360024086e-06, |
|
"loss": 0.6979, |
|
"mean_token_accuracy": 0.8080309733748436, |
|
"num_tokens": 357173455.0, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 0.8232992173389524, |
|
"grad_norm": 0.7213195682522814, |
|
"learning_rate": 3.537025888019266e-06, |
|
"loss": 0.6714, |
|
"mean_token_accuracy": 0.8130217432975769, |
|
"num_tokens": 357828204.0, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 0.8248043347381095, |
|
"grad_norm": 0.8689083991804347, |
|
"learning_rate": 3.506923540036123e-06, |
|
"loss": 0.687, |
|
"mean_token_accuracy": 0.8101729631423951, |
|
"num_tokens": 358480367.0, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 0.8263094521372667, |
|
"grad_norm": 0.689761338219419, |
|
"learning_rate": 3.4768211920529803e-06, |
|
"loss": 0.7044, |
|
"mean_token_accuracy": 0.8075317278504371, |
|
"num_tokens": 359134857.0, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 0.8278145695364238, |
|
"grad_norm": 0.8806667703934201, |
|
"learning_rate": 3.446718844069838e-06, |
|
"loss": 0.6945, |
|
"mean_token_accuracy": 0.8091181218624115, |
|
"num_tokens": 359789043.0, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.829319686935581, |
|
"grad_norm": 0.7766566151875849, |
|
"learning_rate": 3.416616496086695e-06, |
|
"loss": 0.6394, |
|
"mean_token_accuracy": 0.8194618910551071, |
|
"num_tokens": 360442479.0, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 0.8308248043347382, |
|
"grad_norm": 0.7831351840930362, |
|
"learning_rate": 3.3865141481035525e-06, |
|
"loss": 0.6729, |
|
"mean_token_accuracy": 0.8144933819770813, |
|
"num_tokens": 361097254.0, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 0.8323299217338952, |
|
"grad_norm": 0.8509168580247635, |
|
"learning_rate": 3.3564118001204097e-06, |
|
"loss": 0.6533, |
|
"mean_token_accuracy": 0.819504565000534, |
|
"num_tokens": 361750574.0, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 0.8338350391330523, |
|
"grad_norm": 0.7719049692325927, |
|
"learning_rate": 3.326309452137267e-06, |
|
"loss": 0.6956, |
|
"mean_token_accuracy": 0.8098696261644364, |
|
"num_tokens": 362405934.0, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 0.8353401565322095, |
|
"grad_norm": 0.8709709923622949, |
|
"learning_rate": 3.2962071041541242e-06, |
|
"loss": 0.6851, |
|
"mean_token_accuracy": 0.8118442639708519, |
|
"num_tokens": 363060679.0, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.8368452739313667, |
|
"grad_norm": 0.724128820806475, |
|
"learning_rate": 3.2661047561709815e-06, |
|
"loss": 0.7068, |
|
"mean_token_accuracy": 0.8077720448374748, |
|
"num_tokens": 363716039.0, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 0.8383503913305238, |
|
"grad_norm": 0.7771783891240635, |
|
"learning_rate": 3.2360024081878387e-06, |
|
"loss": 0.6975, |
|
"mean_token_accuracy": 0.8087139695882797, |
|
"num_tokens": 364371134.0, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 0.839855508729681, |
|
"grad_norm": 0.7469591339916538, |
|
"learning_rate": 3.2059000602046964e-06, |
|
"loss": 0.7087, |
|
"mean_token_accuracy": 0.806692723929882, |
|
"num_tokens": 365026494.0, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 0.841360626128838, |
|
"grad_norm": 0.6846928405954478, |
|
"learning_rate": 3.1757977122215532e-06, |
|
"loss": 0.6524, |
|
"mean_token_accuracy": 0.8203337088227272, |
|
"num_tokens": 365678047.0, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 0.8428657435279951, |
|
"grad_norm": 0.8165391902454103, |
|
"learning_rate": 3.145695364238411e-06, |
|
"loss": 0.7091, |
|
"mean_token_accuracy": 0.8084227561950683, |
|
"num_tokens": 366332246.0, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.8443708609271523, |
|
"grad_norm": 0.7661797869924849, |
|
"learning_rate": 3.115593016255268e-06, |
|
"loss": 0.6657, |
|
"mean_token_accuracy": 0.814234085381031, |
|
"num_tokens": 366987566.0, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 0.8458759783263095, |
|
"grad_norm": 0.8544597822050432, |
|
"learning_rate": 3.0854906682721254e-06, |
|
"loss": 0.6731, |
|
"mean_token_accuracy": 0.8137959286570549, |
|
"num_tokens": 367641959.0, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 0.8473810957254666, |
|
"grad_norm": 0.7163611803006235, |
|
"learning_rate": 3.0553883202889826e-06, |
|
"loss": 0.6752, |
|
"mean_token_accuracy": 0.814427162706852, |
|
"num_tokens": 368294664.0, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 0.8488862131246238, |
|
"grad_norm": 0.6787078659303751, |
|
"learning_rate": 3.0252859723058403e-06, |
|
"loss": 0.6763, |
|
"mean_token_accuracy": 0.8122038081288337, |
|
"num_tokens": 368948750.0, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 0.8503913305237809, |
|
"grad_norm": 0.7479345264165043, |
|
"learning_rate": 2.995183624322697e-06, |
|
"loss": 0.6783, |
|
"mean_token_accuracy": 0.8135086625814438, |
|
"num_tokens": 369602071.0, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.851896447922938, |
|
"grad_norm": 0.8040713418680959, |
|
"learning_rate": 2.965081276339555e-06, |
|
"loss": 0.6833, |
|
"mean_token_accuracy": 0.8125656425952912, |
|
"num_tokens": 370257431.0, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 0.8534015653220951, |
|
"grad_norm": 0.7144369120802795, |
|
"learning_rate": 2.9349789283564125e-06, |
|
"loss": 0.633, |
|
"mean_token_accuracy": 0.8221362918615341, |
|
"num_tokens": 370912305.0, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 0.8549066827212523, |
|
"grad_norm": 0.7067024966715704, |
|
"learning_rate": 2.9048765803732693e-06, |
|
"loss": 0.6501, |
|
"mean_token_accuracy": 0.8193739414215088, |
|
"num_tokens": 371565909.0, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 0.8564118001204094, |
|
"grad_norm": 0.8244129429342957, |
|
"learning_rate": 2.874774232390127e-06, |
|
"loss": 0.6643, |
|
"mean_token_accuracy": 0.8159393966197968, |
|
"num_tokens": 372221054.0, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 0.8579169175195666, |
|
"grad_norm": 0.6788719585230156, |
|
"learning_rate": 2.844671884406984e-06, |
|
"loss": 0.676, |
|
"mean_token_accuracy": 0.814545676112175, |
|
"num_tokens": 372876414.0, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.8594220349187237, |
|
"grad_norm": 0.8130384688506709, |
|
"learning_rate": 2.814569536423841e-06, |
|
"loss": 0.6965, |
|
"mean_token_accuracy": 0.808952122926712, |
|
"num_tokens": 373531774.0, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 0.8609271523178808, |
|
"grad_norm": 0.7567910527858757, |
|
"learning_rate": 2.7844671884406987e-06, |
|
"loss": 0.6561, |
|
"mean_token_accuracy": 0.8194347232580185, |
|
"num_tokens": 374183754.0, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 0.8624322697170379, |
|
"grad_norm": 0.8119398632578556, |
|
"learning_rate": 2.7543648404575555e-06, |
|
"loss": 0.7265, |
|
"mean_token_accuracy": 0.8022975295782089, |
|
"num_tokens": 374837993.0, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 0.863937387116195, |
|
"grad_norm": 0.7200406785036352, |
|
"learning_rate": 2.724262492474413e-06, |
|
"loss": 0.6936, |
|
"mean_token_accuracy": 0.8087088257074356, |
|
"num_tokens": 375491417.0, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 0.8654425045153522, |
|
"grad_norm": 0.8095096343416092, |
|
"learning_rate": 2.694160144491271e-06, |
|
"loss": 0.6638, |
|
"mean_token_accuracy": 0.815724229812622, |
|
"num_tokens": 376146777.0, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.8669476219145094, |
|
"grad_norm": 0.7838435257134089, |
|
"learning_rate": 2.6640577965081277e-06, |
|
"loss": 0.6436, |
|
"mean_token_accuracy": 0.8215755537152291, |
|
"num_tokens": 376801681.0, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 0.8684527393136665, |
|
"grad_norm": 0.7514700742048708, |
|
"learning_rate": 2.6339554485249854e-06, |
|
"loss": 0.6746, |
|
"mean_token_accuracy": 0.8149273306131363, |
|
"num_tokens": 377457041.0, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 0.8699578567128236, |
|
"grad_norm": 0.8210292942650229, |
|
"learning_rate": 2.6038531005418426e-06, |
|
"loss": 0.7044, |
|
"mean_token_accuracy": 0.8063116610050202, |
|
"num_tokens": 378107635.0, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 0.8714629741119807, |
|
"grad_norm": 0.7396056378113063, |
|
"learning_rate": 2.5737507525587e-06, |
|
"loss": 0.6691, |
|
"mean_token_accuracy": 0.8137347057461739, |
|
"num_tokens": 378761375.0, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 0.8729680915111379, |
|
"grad_norm": 0.8708274255602088, |
|
"learning_rate": 2.543648404575557e-06, |
|
"loss": 0.6323, |
|
"mean_token_accuracy": 0.8234889656305313, |
|
"num_tokens": 379415908.0, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.874473208910295, |
|
"grad_norm": 0.7943943272899322, |
|
"learning_rate": 2.5135460565924148e-06, |
|
"loss": 0.659, |
|
"mean_token_accuracy": 0.8163654163479805, |
|
"num_tokens": 380071268.0, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 0.8759783263094522, |
|
"grad_norm": 0.8493004790291228, |
|
"learning_rate": 2.4834437086092716e-06, |
|
"loss": 0.6711, |
|
"mean_token_accuracy": 0.8139961987733841, |
|
"num_tokens": 380724321.0, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 0.8774834437086093, |
|
"grad_norm": 0.8183163337114058, |
|
"learning_rate": 2.4533413606261293e-06, |
|
"loss": 0.6835, |
|
"mean_token_accuracy": 0.8111695215106011, |
|
"num_tokens": 381379080.0, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 0.8789885611077664, |
|
"grad_norm": 0.8344949442851441, |
|
"learning_rate": 2.4232390126429865e-06, |
|
"loss": 0.6557, |
|
"mean_token_accuracy": 0.8170079737901688, |
|
"num_tokens": 382032396.0, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 0.8804936785069235, |
|
"grad_norm": 0.8101785451764406, |
|
"learning_rate": 2.3931366646598438e-06, |
|
"loss": 0.6413, |
|
"mean_token_accuracy": 0.8225623875856399, |
|
"num_tokens": 382685435.0, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.8819987959060807, |
|
"grad_norm": 0.8931720749252036, |
|
"learning_rate": 2.363034316676701e-06, |
|
"loss": 0.6567, |
|
"mean_token_accuracy": 0.8178651258349419, |
|
"num_tokens": 383339721.0, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 0.8835039133052378, |
|
"grad_norm": 0.8064725902594196, |
|
"learning_rate": 2.3329319686935583e-06, |
|
"loss": 0.6808, |
|
"mean_token_accuracy": 0.8125994563102722, |
|
"num_tokens": 383993935.0, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 0.885009030704395, |
|
"grad_norm": 0.7727113900390351, |
|
"learning_rate": 2.3028296207104155e-06, |
|
"loss": 0.6607, |
|
"mean_token_accuracy": 0.8179493889212608, |
|
"num_tokens": 384648353.0, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 0.8865141481035521, |
|
"grad_norm": 0.8692477292585057, |
|
"learning_rate": 2.2727272727272728e-06, |
|
"loss": 0.6836, |
|
"mean_token_accuracy": 0.8113144382834434, |
|
"num_tokens": 385302928.0, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 0.8880192655027093, |
|
"grad_norm": 0.8390504027322997, |
|
"learning_rate": 2.2426249247441304e-06, |
|
"loss": 0.7002, |
|
"mean_token_accuracy": 0.8089157208800316, |
|
"num_tokens": 385956340.0, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.8895243829018663, |
|
"grad_norm": 0.8150721235314988, |
|
"learning_rate": 2.2125225767609877e-06, |
|
"loss": 0.6929, |
|
"mean_token_accuracy": 0.8088451266288758, |
|
"num_tokens": 386610619.0, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 0.8910295003010235, |
|
"grad_norm": 0.8046328603226446, |
|
"learning_rate": 2.182420228777845e-06, |
|
"loss": 0.6519, |
|
"mean_token_accuracy": 0.81864313185215, |
|
"num_tokens": 387265979.0, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 0.8925346177001806, |
|
"grad_norm": 0.756159894396759, |
|
"learning_rate": 2.152317880794702e-06, |
|
"loss": 0.6786, |
|
"mean_token_accuracy": 0.8141127720475196, |
|
"num_tokens": 387919662.0, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 0.8940397350993378, |
|
"grad_norm": 0.7776430394692136, |
|
"learning_rate": 2.1222155328115594e-06, |
|
"loss": 0.6698, |
|
"mean_token_accuracy": 0.8145410984754562, |
|
"num_tokens": 388575022.0, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 0.8955448524984949, |
|
"grad_norm": 0.7865527176037842, |
|
"learning_rate": 2.0921131848284167e-06, |
|
"loss": 0.6801, |
|
"mean_token_accuracy": 0.8134633019566536, |
|
"num_tokens": 389230382.0, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.8970499698976521, |
|
"grad_norm": 0.7925365048007713, |
|
"learning_rate": 2.062010836845274e-06, |
|
"loss": 0.6873, |
|
"mean_token_accuracy": 0.8116836905479431, |
|
"num_tokens": 389883695.0, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 0.8985550872968091, |
|
"grad_norm": 0.8058845590867396, |
|
"learning_rate": 2.0319084888621316e-06, |
|
"loss": 0.682, |
|
"mean_token_accuracy": 0.8133537322282791, |
|
"num_tokens": 390537525.0, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 0.9000602046959663, |
|
"grad_norm": 0.820259998347803, |
|
"learning_rate": 2.001806140878989e-06, |
|
"loss": 0.6751, |
|
"mean_token_accuracy": 0.8127815544605255, |
|
"num_tokens": 391191825.0, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 0.9015653220951234, |
|
"grad_norm": 0.7974896272888695, |
|
"learning_rate": 1.971703792895846e-06, |
|
"loss": 0.6484, |
|
"mean_token_accuracy": 0.8203530460596085, |
|
"num_tokens": 391846060.0, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 0.9030704394942806, |
|
"grad_norm": 0.7428984425282655, |
|
"learning_rate": 1.9416014449127033e-06, |
|
"loss": 0.6588, |
|
"mean_token_accuracy": 0.8173960685729981, |
|
"num_tokens": 392500211.0, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.9045755568934377, |
|
"grad_norm": 0.8040708292776918, |
|
"learning_rate": 1.9114990969295606e-06, |
|
"loss": 0.7021, |
|
"mean_token_accuracy": 0.8088914528489113, |
|
"num_tokens": 393150561.0, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 0.9060806742925949, |
|
"grad_norm": 0.8807619038953524, |
|
"learning_rate": 1.881396748946418e-06, |
|
"loss": 0.6416, |
|
"mean_token_accuracy": 0.8225369155406952, |
|
"num_tokens": 393805719.0, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 0.9075857916917519, |
|
"grad_norm": 0.8563259131265296, |
|
"learning_rate": 1.8512944009632753e-06, |
|
"loss": 0.6864, |
|
"mean_token_accuracy": 0.8107398003339767, |
|
"num_tokens": 394461079.0, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 0.8386967646599575, |
|
"learning_rate": 1.8211920529801325e-06, |
|
"loss": 0.6979, |
|
"mean_token_accuracy": 0.8091868460178375, |
|
"num_tokens": 395114814.0, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 0.9105960264900662, |
|
"grad_norm": 0.782498683599273, |
|
"learning_rate": 1.79108970499699e-06, |
|
"loss": 0.6561, |
|
"mean_token_accuracy": 0.8184771955013275, |
|
"num_tokens": 395766884.0, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.9121011438892234, |
|
"grad_norm": 0.7709652287388354, |
|
"learning_rate": 1.7609873570138472e-06, |
|
"loss": 0.6416, |
|
"mean_token_accuracy": 0.8183257013559342, |
|
"num_tokens": 396422148.0, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 0.9136062612883805, |
|
"grad_norm": 0.6976938880223098, |
|
"learning_rate": 1.7308850090307045e-06, |
|
"loss": 0.6577, |
|
"mean_token_accuracy": 0.8160424426198005, |
|
"num_tokens": 397075791.0, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 0.9151113786875377, |
|
"grad_norm": 0.7892179303260687, |
|
"learning_rate": 1.700782661047562e-06, |
|
"loss": 0.7052, |
|
"mean_token_accuracy": 0.8071174398064613, |
|
"num_tokens": 397730862.0, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 0.9166164960866947, |
|
"grad_norm": 0.7715099994986613, |
|
"learning_rate": 1.6706803130644192e-06, |
|
"loss": 0.6989, |
|
"mean_token_accuracy": 0.808410356938839, |
|
"num_tokens": 398383816.0, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 0.9181216134858519, |
|
"grad_norm": 0.7866013965422197, |
|
"learning_rate": 1.6405779650812764e-06, |
|
"loss": 0.6853, |
|
"mean_token_accuracy": 0.8117044195532799, |
|
"num_tokens": 399036130.0, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.919626730885009, |
|
"grad_norm": 0.71269595335957, |
|
"learning_rate": 1.6104756170981337e-06, |
|
"loss": 0.6567, |
|
"mean_token_accuracy": 0.8164850741624832, |
|
"num_tokens": 399689802.0, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 0.9211318482841662, |
|
"grad_norm": 0.7488056490664857, |
|
"learning_rate": 1.5803732691149911e-06, |
|
"loss": 0.6799, |
|
"mean_token_accuracy": 0.8110664993524551, |
|
"num_tokens": 400345162.0, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 0.9226369656833233, |
|
"grad_norm": 0.7458014489415327, |
|
"learning_rate": 1.5502709211318484e-06, |
|
"loss": 0.681, |
|
"mean_token_accuracy": 0.811719287931919, |
|
"num_tokens": 400999661.0, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 0.9241420830824805, |
|
"grad_norm": 0.7886363158801694, |
|
"learning_rate": 1.5201685731487056e-06, |
|
"loss": 0.6862, |
|
"mean_token_accuracy": 0.8114742293953896, |
|
"num_tokens": 401653950.0, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 0.9256472004816376, |
|
"grad_norm": 0.7391585776509119, |
|
"learning_rate": 1.490066225165563e-06, |
|
"loss": 0.696, |
|
"mean_token_accuracy": 0.8081017956137657, |
|
"num_tokens": 402309310.0, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.9271523178807947, |
|
"grad_norm": 0.770098349627336, |
|
"learning_rate": 1.4599638771824203e-06, |
|
"loss": 0.6457, |
|
"mean_token_accuracy": 0.8203165084123611, |
|
"num_tokens": 402964109.0, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 0.9286574352799518, |
|
"grad_norm": 0.7730099532771224, |
|
"learning_rate": 1.4298615291992776e-06, |
|
"loss": 0.6808, |
|
"mean_token_accuracy": 0.8133900195360184, |
|
"num_tokens": 403619469.0, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 0.930162552679109, |
|
"grad_norm": 0.7197758247011444, |
|
"learning_rate": 1.3997591812161348e-06, |
|
"loss": 0.6609, |
|
"mean_token_accuracy": 0.8174248903989791, |
|
"num_tokens": 404271956.0, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 0.9316676700782661, |
|
"grad_norm": 0.7643788659356211, |
|
"learning_rate": 1.3696568332329923e-06, |
|
"loss": 0.6629, |
|
"mean_token_accuracy": 0.8163714617490768, |
|
"num_tokens": 404926972.0, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 0.9331727874774233, |
|
"grad_norm": 0.8003243746970174, |
|
"learning_rate": 1.3395544852498495e-06, |
|
"loss": 0.6571, |
|
"mean_token_accuracy": 0.8182735562324523, |
|
"num_tokens": 405581226.0, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.9346779048765804, |
|
"grad_norm": 0.8390202133619289, |
|
"learning_rate": 1.3094521372667068e-06, |
|
"loss": 0.6646, |
|
"mean_token_accuracy": 0.8168047055602073, |
|
"num_tokens": 406236010.0, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 0.9361830222757375, |
|
"grad_norm": 0.8218023276195088, |
|
"learning_rate": 1.2793497892835642e-06, |
|
"loss": 0.6798, |
|
"mean_token_accuracy": 0.813007053732872, |
|
"num_tokens": 406889781.0, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 0.9376881396748946, |
|
"grad_norm": 0.8049728095384475, |
|
"learning_rate": 1.2492474413004215e-06, |
|
"loss": 0.6977, |
|
"mean_token_accuracy": 0.8095493704080582, |
|
"num_tokens": 407542794.0, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 0.9391932570740518, |
|
"grad_norm": 0.7952147884853261, |
|
"learning_rate": 1.2191450933172787e-06, |
|
"loss": 0.671, |
|
"mean_token_accuracy": 0.8143075197935105, |
|
"num_tokens": 408198154.0, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 0.9406983744732089, |
|
"grad_norm": 0.7599989329771313, |
|
"learning_rate": 1.1890427453341362e-06, |
|
"loss": 0.6488, |
|
"mean_token_accuracy": 0.8212261810898781, |
|
"num_tokens": 408853514.0, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.9422034918723661, |
|
"grad_norm": 0.7216867579887762, |
|
"learning_rate": 1.1589403973509934e-06, |
|
"loss": 0.6347, |
|
"mean_token_accuracy": 0.8228215038776397, |
|
"num_tokens": 409508874.0, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 0.9437086092715232, |
|
"grad_norm": 0.7067922141116312, |
|
"learning_rate": 1.1288380493678507e-06, |
|
"loss": 0.659, |
|
"mean_token_accuracy": 0.8190646037459374, |
|
"num_tokens": 410163248.0, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 0.9452137266706803, |
|
"grad_norm": 0.948728528316048, |
|
"learning_rate": 1.098735701384708e-06, |
|
"loss": 0.643, |
|
"mean_token_accuracy": 0.8224123731255532, |
|
"num_tokens": 410818608.0, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 0.9467188440698374, |
|
"grad_norm": 0.7892350481303253, |
|
"learning_rate": 1.0686333534015654e-06, |
|
"loss": 0.6608, |
|
"mean_token_accuracy": 0.8173749342560768, |
|
"num_tokens": 411471107.0, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 0.9482239614689946, |
|
"grad_norm": 0.8083478104312511, |
|
"learning_rate": 1.0385310054184229e-06, |
|
"loss": 0.6872, |
|
"mean_token_accuracy": 0.811252748966217, |
|
"num_tokens": 412126467.0, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.9497290788681517, |
|
"grad_norm": 0.8938476500962467, |
|
"learning_rate": 1.00842865743528e-06, |
|
"loss": 0.6601, |
|
"mean_token_accuracy": 0.8179592058062554, |
|
"num_tokens": 412781827.0, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 0.9512341962673089, |
|
"grad_norm": 0.8596759733933999, |
|
"learning_rate": 9.783263094521374e-07, |
|
"loss": 0.6571, |
|
"mean_token_accuracy": 0.8197621509432793, |
|
"num_tokens": 413436727.0, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 0.952739313666466, |
|
"grad_norm": 0.7467533643516227, |
|
"learning_rate": 9.482239614689946e-07, |
|
"loss": 0.6792, |
|
"mean_token_accuracy": 0.8140778690576553, |
|
"num_tokens": 414088041.0, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 0.9542444310656231, |
|
"grad_norm": 0.7518313373444699, |
|
"learning_rate": 9.18121613485852e-07, |
|
"loss": 0.6671, |
|
"mean_token_accuracy": 0.8171729937195777, |
|
"num_tokens": 414743401.0, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 0.9557495484647802, |
|
"grad_norm": 0.7147981832977893, |
|
"learning_rate": 8.880192655027092e-07, |
|
"loss": 0.6693, |
|
"mean_token_accuracy": 0.8162096992135048, |
|
"num_tokens": 415398761.0, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 0.9572546658639374, |
|
"grad_norm": 0.7787214649354509, |
|
"learning_rate": 8.579169175195666e-07, |
|
"loss": 0.6587, |
|
"mean_token_accuracy": 0.8199209123849869, |
|
"num_tokens": 416054121.0, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 0.9587597832630945, |
|
"grad_norm": 0.7557139318657753, |
|
"learning_rate": 8.27814569536424e-07, |
|
"loss": 0.6562, |
|
"mean_token_accuracy": 0.8186782419681549, |
|
"num_tokens": 416709481.0, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 0.9602649006622517, |
|
"grad_norm": 0.8281637709136592, |
|
"learning_rate": 7.977122215532813e-07, |
|
"loss": 0.6656, |
|
"mean_token_accuracy": 0.8177010849118233, |
|
"num_tokens": 417363825.0, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 0.9617700180614088, |
|
"grad_norm": 0.8103144240736111, |
|
"learning_rate": 7.676098735701386e-07, |
|
"loss": 0.6987, |
|
"mean_token_accuracy": 0.8090856596827507, |
|
"num_tokens": 418018667.0, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 0.963275135460566, |
|
"grad_norm": 0.8780690778093717, |
|
"learning_rate": 7.375075255869959e-07, |
|
"loss": 0.6565, |
|
"mean_token_accuracy": 0.8164542749524116, |
|
"num_tokens": 418673242.0, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.964780252859723, |
|
"grad_norm": 0.8301073825181444, |
|
"learning_rate": 7.074051776038532e-07, |
|
"loss": 0.7105, |
|
"mean_token_accuracy": 0.8061650216579437, |
|
"num_tokens": 419328467.0, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 0.9662853702588802, |
|
"grad_norm": 0.7743398747402613, |
|
"learning_rate": 6.773028296207105e-07, |
|
"loss": 0.6952, |
|
"mean_token_accuracy": 0.8110766768455505, |
|
"num_tokens": 419983412.0, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 0.9677904876580373, |
|
"grad_norm": 0.7729942676504536, |
|
"learning_rate": 6.472004816375678e-07, |
|
"loss": 0.6842, |
|
"mean_token_accuracy": 0.8116374552249909, |
|
"num_tokens": 420638772.0, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 0.9692956050571945, |
|
"grad_norm": 0.8396932408066435, |
|
"learning_rate": 6.170981336544251e-07, |
|
"loss": 0.624, |
|
"mean_token_accuracy": 0.8242242723703385, |
|
"num_tokens": 421293329.0, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 0.9708007224563516, |
|
"grad_norm": 0.7456802986717396, |
|
"learning_rate": 5.869957856712824e-07, |
|
"loss": 0.6733, |
|
"mean_token_accuracy": 0.8136846616864204, |
|
"num_tokens": 421948689.0, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 0.9723058398555088, |
|
"grad_norm": 0.7987936501010671, |
|
"learning_rate": 5.568934376881397e-07, |
|
"loss": 0.6318, |
|
"mean_token_accuracy": 0.8227405115962029, |
|
"num_tokens": 422603207.0, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 0.9738109572546658, |
|
"grad_norm": 0.7936383192328358, |
|
"learning_rate": 5.26791089704997e-07, |
|
"loss": 0.6563, |
|
"mean_token_accuracy": 0.8187880471348763, |
|
"num_tokens": 423254732.0, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 0.975316074653823, |
|
"grad_norm": 0.7432652222612889, |
|
"learning_rate": 4.966887417218544e-07, |
|
"loss": 0.6487, |
|
"mean_token_accuracy": 0.818960678577423, |
|
"num_tokens": 423910092.0, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 0.9768211920529801, |
|
"grad_norm": 0.8511981353686215, |
|
"learning_rate": 4.6658639373871166e-07, |
|
"loss": 0.6799, |
|
"mean_token_accuracy": 0.8121935516595841, |
|
"num_tokens": 424564826.0, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 0.9783263094521373, |
|
"grad_norm": 0.7313398696799408, |
|
"learning_rate": 4.36484045755569e-07, |
|
"loss": 0.7109, |
|
"mean_token_accuracy": 0.8065851047635079, |
|
"num_tokens": 425219708.0, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.9798314268512944, |
|
"grad_norm": 0.7750629777088383, |
|
"learning_rate": 4.063816977724263e-07, |
|
"loss": 0.6841, |
|
"mean_token_accuracy": 0.8126775458455086, |
|
"num_tokens": 425873155.0, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 0.9813365442504516, |
|
"grad_norm": 0.7744993603101556, |
|
"learning_rate": 3.762793497892836e-07, |
|
"loss": 0.6323, |
|
"mean_token_accuracy": 0.8231482058763504, |
|
"num_tokens": 426528515.0, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 0.9828416616496086, |
|
"grad_norm": 0.7716391137183645, |
|
"learning_rate": 3.461770018061409e-07, |
|
"loss": 0.6716, |
|
"mean_token_accuracy": 0.814750799536705, |
|
"num_tokens": 427183304.0, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 0.9843467790487658, |
|
"grad_norm": 0.8256084259353021, |
|
"learning_rate": 3.160746538229982e-07, |
|
"loss": 0.638, |
|
"mean_token_accuracy": 0.8222943916916847, |
|
"num_tokens": 427835957.0, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 0.9858518964479229, |
|
"grad_norm": 0.7470228300092132, |
|
"learning_rate": 2.8597230583985557e-07, |
|
"loss": 0.6029, |
|
"mean_token_accuracy": 0.8287707567214966, |
|
"num_tokens": 428491317.0, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 0.9873570138470801, |
|
"grad_norm": 0.7560617660239669, |
|
"learning_rate": 2.5586995785671287e-07, |
|
"loss": 0.6709, |
|
"mean_token_accuracy": 0.8148993030190468, |
|
"num_tokens": 429142202.0, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 0.9888621312462372, |
|
"grad_norm": 0.7086279970183982, |
|
"learning_rate": 2.2576760987357014e-07, |
|
"loss": 0.6567, |
|
"mean_token_accuracy": 0.8180391117930412, |
|
"num_tokens": 429796213.0, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 0.9903672486453944, |
|
"grad_norm": 0.7788087082450003, |
|
"learning_rate": 1.9566526189042744e-07, |
|
"loss": 0.701, |
|
"mean_token_accuracy": 0.8085399270057678, |
|
"num_tokens": 430447380.0, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 0.9918723660445514, |
|
"grad_norm": 0.7878945507101885, |
|
"learning_rate": 1.655629139072848e-07, |
|
"loss": 0.6506, |
|
"mean_token_accuracy": 0.8179336041212082, |
|
"num_tokens": 431101296.0, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 0.9933774834437086, |
|
"grad_norm": 0.7121310866638965, |
|
"learning_rate": 1.354605659241421e-07, |
|
"loss": 0.6688, |
|
"mean_token_accuracy": 0.8157944530248642, |
|
"num_tokens": 431756656.0, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.9948826008428657, |
|
"grad_norm": 0.8306725658530479, |
|
"learning_rate": 1.053582179409994e-07, |
|
"loss": 0.6956, |
|
"mean_token_accuracy": 0.8097556039690972, |
|
"num_tokens": 432411158.0, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 0.9963877182420229, |
|
"grad_norm": 0.8661659471981039, |
|
"learning_rate": 7.525586995785672e-08, |
|
"loss": 0.6507, |
|
"mean_token_accuracy": 0.8198384776711464, |
|
"num_tokens": 433066518.0, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 0.99789283564118, |
|
"grad_norm": 0.8034747470725212, |
|
"learning_rate": 4.5153521974714037e-08, |
|
"loss": 0.7082, |
|
"mean_token_accuracy": 0.8069040149450302, |
|
"num_tokens": 433720023.0, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 0.9993979530403372, |
|
"grad_norm": 0.7599732131076518, |
|
"learning_rate": 1.5051173991571343e-08, |
|
"loss": 0.6537, |
|
"mean_token_accuracy": 0.8202048733830452, |
|
"num_tokens": 434375383.0, |
|
"step": 6640 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 6644, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 695533043712000.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|