{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.272984441301273,
  "eval_steps": 500,
  "global_step": 900,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.014144271570014143,
      "grad_norm": 85.7519302368164,
      "learning_rate": 4.976426214049977e-05,
      "loss": 9.3898,
      "step": 10
    },
    {
      "epoch": 0.028288543140028287,
      "grad_norm": 33.235904693603516,
      "learning_rate": 4.952852428099953e-05,
      "loss": 5.3413,
      "step": 20
    },
    {
      "epoch": 0.042432814710042434,
      "grad_norm": 5.113602161407471,
      "learning_rate": 4.9292786421499294e-05,
      "loss": 4.854,
      "step": 30
    },
    {
      "epoch": 0.056577086280056574,
      "grad_norm": 36.069705963134766,
      "learning_rate": 4.9057048561999055e-05,
      "loss": 5.0425,
      "step": 40
    },
    {
      "epoch": 0.07072135785007072,
      "grad_norm": 33.71372604370117,
      "learning_rate": 4.882131070249882e-05,
      "loss": 5.04,
      "step": 50
    },
    {
      "epoch": 0.08486562942008487,
      "grad_norm": 10.95967960357666,
      "learning_rate": 4.858557284299859e-05,
      "loss": 4.5402,
      "step": 60
    },
    {
      "epoch": 0.09900990099009901,
      "grad_norm": 57.9084358215332,
      "learning_rate": 4.834983498349835e-05,
      "loss": 4.8505,
      "step": 70
    },
    {
      "epoch": 0.11315417256011315,
      "grad_norm": 34.007869720458984,
      "learning_rate": 4.8114097123998114e-05,
      "loss": 4.3305,
      "step": 80
    },
    {
      "epoch": 0.1272984441301273,
      "grad_norm": 15.245634078979492,
      "learning_rate": 4.787835926449788e-05,
      "loss": 4.4707,
      "step": 90
    },
    {
      "epoch": 0.14144271570014144,
      "grad_norm": 24.47016143798828,
      "learning_rate": 4.7642621404997644e-05,
      "loss": 4.5379,
      "step": 100
    },
    {
      "epoch": 0.15558698727015557,
      "grad_norm": 14.591358184814453,
      "learning_rate": 4.740688354549741e-05,
      "loss": 4.5205,
      "step": 110
    },
    {
      "epoch": 0.16973125884016974,
      "grad_norm": 18.776493072509766,
      "learning_rate": 4.7171145685997174e-05,
      "loss": 4.6578,
      "step": 120
    },
    {
      "epoch": 0.18387553041018387,
      "grad_norm": 4.018069267272949,
      "learning_rate": 4.6935407826496935e-05,
      "loss": 4.1864,
      "step": 130
    },
    {
      "epoch": 0.19801980198019803,
      "grad_norm": 8.038480758666992,
      "learning_rate": 4.6699669966996704e-05,
      "loss": 4.5046,
      "step": 140
    },
    {
      "epoch": 0.21216407355021216,
      "grad_norm": 2.9573426246643066,
      "learning_rate": 4.6463932107496465e-05,
      "loss": 4.2317,
      "step": 150
    },
    {
      "epoch": 0.2263083451202263,
      "grad_norm": 3.0037927627563477,
      "learning_rate": 4.622819424799623e-05,
      "loss": 4.0951,
      "step": 160
    },
    {
      "epoch": 0.24045261669024046,
      "grad_norm": 9.886232376098633,
      "learning_rate": 4.5992456388495995e-05,
      "loss": 4.0168,
      "step": 170
    },
    {
      "epoch": 0.2545968882602546,
      "grad_norm": 2.463179588317871,
      "learning_rate": 4.5756718528995756e-05,
      "loss": 4.0372,
      "step": 180
    },
    {
      "epoch": 0.26874115983026875,
      "grad_norm": 13.546555519104004,
      "learning_rate": 4.5520980669495525e-05,
      "loss": 4.1929,
      "step": 190
    },
    {
      "epoch": 0.2828854314002829,
      "grad_norm": 2.4467103481292725,
      "learning_rate": 4.5285242809995286e-05,
      "loss": 4.2146,
      "step": 200
    },
    {
      "epoch": 0.297029702970297,
      "grad_norm": 5.936313152313232,
      "learning_rate": 4.5049504950495054e-05,
      "loss": 4.6069,
      "step": 210
    },
    {
      "epoch": 0.31117397454031115,
      "grad_norm": 6.5272536277771,
      "learning_rate": 4.4813767090994816e-05,
      "loss": 4.3263,
      "step": 220
    },
    {
      "epoch": 0.32531824611032534,
      "grad_norm": 4.2881598472595215,
      "learning_rate": 4.457802923149458e-05,
      "loss": 3.8218,
      "step": 230
    },
    {
      "epoch": 0.33946251768033947,
      "grad_norm": 3.9945058822631836,
      "learning_rate": 4.4342291371994345e-05,
      "loss": 4.1532,
      "step": 240
    },
    {
      "epoch": 0.3536067892503536,
      "grad_norm": 4.577730655670166,
      "learning_rate": 4.410655351249411e-05,
      "loss": 4.0727,
      "step": 250
    },
    {
      "epoch": 0.36775106082036774,
      "grad_norm": 2.6052353382110596,
      "learning_rate": 4.3870815652993875e-05,
      "loss": 3.9074,
      "step": 260
    },
    {
      "epoch": 0.38189533239038187,
      "grad_norm": 15.787618637084961,
      "learning_rate": 4.363507779349364e-05,
      "loss": 4.2402,
      "step": 270
    },
    {
      "epoch": 0.39603960396039606,
      "grad_norm": 23.970670700073242,
      "learning_rate": 4.33993399339934e-05,
      "loss": 4.3765,
      "step": 280
    },
    {
      "epoch": 0.4101838755304102,
      "grad_norm": 2.5313973426818848,
      "learning_rate": 4.3163602074493166e-05,
      "loss": 3.9863,
      "step": 290
    },
    {
      "epoch": 0.4243281471004243,
      "grad_norm": 21.020267486572266,
      "learning_rate": 4.292786421499293e-05,
      "loss": 4.0341,
      "step": 300
    },
    {
      "epoch": 0.43847241867043846,
      "grad_norm": 9.731268882751465,
      "learning_rate": 4.2692126355492696e-05,
      "loss": 4.1089,
      "step": 310
    },
    {
      "epoch": 0.4526166902404526,
      "grad_norm": 4.240326881408691,
      "learning_rate": 4.245638849599246e-05,
      "loss": 3.9737,
      "step": 320
    },
    {
      "epoch": 0.4667609618104668,
      "grad_norm": 15.72867202758789,
      "learning_rate": 4.222065063649222e-05,
      "loss": 4.454,
      "step": 330
    },
    {
      "epoch": 0.4809052333804809,
      "grad_norm": 10.669405937194824,
      "learning_rate": 4.198491277699199e-05,
      "loss": 4.1596,
      "step": 340
    },
    {
      "epoch": 0.49504950495049505,
      "grad_norm": 11.927492141723633,
      "learning_rate": 4.174917491749175e-05,
      "loss": 4.0485,
      "step": 350
    },
    {
      "epoch": 0.5091937765205092,
      "grad_norm": 7.629958629608154,
      "learning_rate": 4.151343705799152e-05,
      "loss": 4.1567,
      "step": 360
    },
    {
      "epoch": 0.5233380480905233,
      "grad_norm": 32.22209930419922,
      "learning_rate": 4.1277699198491285e-05,
      "loss": 4.9187,
      "step": 370
    },
    {
      "epoch": 0.5374823196605375,
      "grad_norm": 7.841526985168457,
      "learning_rate": 4.104196133899104e-05,
      "loss": 4.0989,
      "step": 380
    },
    {
      "epoch": 0.5516265912305516,
      "grad_norm": 3.8099868297576904,
      "learning_rate": 4.080622347949081e-05,
      "loss": 4.4586,
      "step": 390
    },
    {
      "epoch": 0.5657708628005658,
      "grad_norm": 11.720135688781738,
      "learning_rate": 4.057048561999057e-05,
      "loss": 4.1915,
      "step": 400
    },
    {
      "epoch": 0.57991513437058,
      "grad_norm": 5.8960280418396,
      "learning_rate": 4.033474776049034e-05,
      "loss": 4.3458,
      "step": 410
    },
    {
      "epoch": 0.594059405940594,
      "grad_norm": 3.532780885696411,
      "learning_rate": 4.0099009900990106e-05,
      "loss": 4.2094,
      "step": 420
    },
    {
      "epoch": 0.6082036775106082,
      "grad_norm": 6.77498722076416,
      "learning_rate": 3.986327204148986e-05,
      "loss": 4.3139,
      "step": 430
    },
    {
      "epoch": 0.6223479490806223,
      "grad_norm": 23.035005569458008,
      "learning_rate": 3.962753418198963e-05,
      "loss": 4.0213,
      "step": 440
    },
    {
      "epoch": 0.6364922206506365,
      "grad_norm": 3.033621311187744,
      "learning_rate": 3.939179632248939e-05,
      "loss": 4.0612,
      "step": 450
    },
    {
      "epoch": 0.6506364922206507,
      "grad_norm": 32.6967887878418,
      "learning_rate": 3.915605846298916e-05,
      "loss": 4.1847,
      "step": 460
    },
    {
      "epoch": 0.6647807637906648,
      "grad_norm": 9.779464721679688,
      "learning_rate": 3.892032060348893e-05,
      "loss": 4.1741,
      "step": 470
    },
    {
      "epoch": 0.6789250353606789,
      "grad_norm": 14.904414176940918,
      "learning_rate": 3.868458274398868e-05,
      "loss": 4.1909,
      "step": 480
    },
    {
      "epoch": 0.693069306930693,
      "grad_norm": 34.94367218017578,
      "learning_rate": 3.844884488448845e-05,
      "loss": 4.9592,
      "step": 490
    },
    {
      "epoch": 0.7072135785007072,
      "grad_norm": 6.339000701904297,
      "learning_rate": 3.821310702498822e-05,
      "loss": 3.7726,
      "step": 500
    },
    {
      "epoch": 0.7213578500707214,
      "grad_norm": 17.672000885009766,
      "learning_rate": 3.797736916548798e-05,
      "loss": 3.9599,
      "step": 510
    },
    {
      "epoch": 0.7355021216407355,
      "grad_norm": 13.348356246948242,
      "learning_rate": 3.774163130598775e-05,
      "loss": 3.8342,
      "step": 520
    },
    {
      "epoch": 0.7496463932107497,
      "grad_norm": 1.8930085897445679,
      "learning_rate": 3.75058934464875e-05,
      "loss": 4.0049,
      "step": 530
    },
    {
      "epoch": 0.7637906647807637,
      "grad_norm": 35.62409210205078,
      "learning_rate": 3.727015558698727e-05,
      "loss": 3.9412,
      "step": 540
    },
    {
      "epoch": 0.7779349363507779,
      "grad_norm": 2.246541738510132,
      "learning_rate": 3.703441772748704e-05,
      "loss": 3.893,
      "step": 550
    },
    {
      "epoch": 0.7920792079207921,
      "grad_norm": 57.89748001098633,
      "learning_rate": 3.67986798679868e-05,
      "loss": 4.2004,
      "step": 560
    },
    {
      "epoch": 0.8062234794908062,
      "grad_norm": 13.958605766296387,
      "learning_rate": 3.656294200848657e-05,
      "loss": 4.5987,
      "step": 570
    },
    {
      "epoch": 0.8203677510608204,
      "grad_norm": 7.963130950927734,
      "learning_rate": 3.6327204148986324e-05,
      "loss": 3.8688,
      "step": 580
    },
    {
      "epoch": 0.8345120226308345,
      "grad_norm": 12.124194145202637,
      "learning_rate": 3.609146628948609e-05,
      "loss": 3.8787,
      "step": 590
    },
    {
      "epoch": 0.8486562942008486,
      "grad_norm": 19.39701271057129,
      "learning_rate": 3.585572842998586e-05,
      "loss": 3.8274,
      "step": 600
    },
    {
      "epoch": 0.8628005657708628,
      "grad_norm": 7.561882495880127,
      "learning_rate": 3.561999057048562e-05,
      "loss": 3.917,
      "step": 610
    },
    {
      "epoch": 0.8769448373408769,
      "grad_norm": 8.699311256408691,
      "learning_rate": 3.538425271098539e-05,
      "loss": 3.8819,
      "step": 620
    },
    {
      "epoch": 0.8910891089108911,
      "grad_norm": 10.60632038116455,
      "learning_rate": 3.514851485148515e-05,
      "loss": 4.3288,
      "step": 630
    },
    {
      "epoch": 0.9052333804809052,
      "grad_norm": 5.851240634918213,
      "learning_rate": 3.491277699198491e-05,
      "loss": 3.7157,
      "step": 640
    },
    {
      "epoch": 0.9193776520509194,
      "grad_norm": 12.624049186706543,
      "learning_rate": 3.467703913248468e-05,
      "loss": 4.0151,
      "step": 650
    },
    {
      "epoch": 0.9335219236209336,
      "grad_norm": 10.379075050354004,
      "learning_rate": 3.444130127298444e-05,
      "loss": 4.1419,
      "step": 660
    },
    {
      "epoch": 0.9476661951909476,
      "grad_norm": 11.247940063476562,
      "learning_rate": 3.420556341348421e-05,
      "loss": 3.5958,
      "step": 670
    },
    {
      "epoch": 0.9618104667609618,
      "grad_norm": 10.014704704284668,
      "learning_rate": 3.396982555398397e-05,
      "loss": 3.7822,
      "step": 680
    },
    {
      "epoch": 0.9759547383309759,
      "grad_norm": 8.791955947875977,
      "learning_rate": 3.3734087694483734e-05,
      "loss": 3.9809,
      "step": 690
    },
    {
      "epoch": 0.9900990099009901,
      "grad_norm": 30.620357513427734,
      "learning_rate": 3.34983498349835e-05,
      "loss": 3.9702,
      "step": 700
    },
    {
      "epoch": 1.0042432814710043,
      "grad_norm": 23.29230499267578,
      "learning_rate": 3.326261197548326e-05,
      "loss": 3.8758,
      "step": 710
    },
    {
      "epoch": 1.0183875530410185,
      "grad_norm": 6.364682674407959,
      "learning_rate": 3.302687411598303e-05,
      "loss": 3.6476,
      "step": 720
    },
    {
      "epoch": 1.0325318246110324,
      "grad_norm": 22.594091415405273,
      "learning_rate": 3.279113625648279e-05,
      "loss": 4.2427,
      "step": 730
    },
    {
      "epoch": 1.0466760961810466,
      "grad_norm": 31.865617752075195,
      "learning_rate": 3.2555398396982555e-05,
      "loss": 3.8927,
      "step": 740
    },
    {
      "epoch": 1.0608203677510608,
      "grad_norm": 2.553858757019043,
      "learning_rate": 3.231966053748232e-05,
      "loss": 3.6406,
      "step": 750
    },
    {
      "epoch": 1.074964639321075,
      "grad_norm": 24.558555603027344,
      "learning_rate": 3.2083922677982084e-05,
      "loss": 3.9391,
      "step": 760
    },
    {
      "epoch": 1.0891089108910892,
      "grad_norm": 3.9098362922668457,
      "learning_rate": 3.184818481848185e-05,
      "loss": 4.2241,
      "step": 770
    },
    {
      "epoch": 1.1032531824611032,
      "grad_norm": 14.435652732849121,
      "learning_rate": 3.1612446958981614e-05,
      "loss": 3.5212,
      "step": 780
    },
    {
      "epoch": 1.1173974540311173,
      "grad_norm": 4.891509056091309,
      "learning_rate": 3.1376709099481375e-05,
      "loss": 3.3049,
      "step": 790
    },
    {
      "epoch": 1.1315417256011315,
      "grad_norm": 10.893304824829102,
      "learning_rate": 3.1140971239981144e-05,
      "loss": 3.7428,
      "step": 800
    },
    {
      "epoch": 1.1456859971711457,
      "grad_norm": 3.631542921066284,
      "learning_rate": 3.0905233380480905e-05,
      "loss": 3.7109,
      "step": 810
    },
    {
      "epoch": 1.15983026874116,
      "grad_norm": 17.292734146118164,
      "learning_rate": 3.0669495520980673e-05,
      "loss": 3.748,
      "step": 820
    },
    {
      "epoch": 1.1739745403111739,
      "grad_norm": 12.438305854797363,
      "learning_rate": 3.043375766148043e-05,
      "loss": 4.5263,
      "step": 830
    },
    {
      "epoch": 1.188118811881188,
      "grad_norm": 7.694697380065918,
      "learning_rate": 3.01980198019802e-05,
      "loss": 3.9575,
      "step": 840
    },
    {
      "epoch": 1.2022630834512023,
      "grad_norm": 9.88021469116211,
      "learning_rate": 2.9962281942479965e-05,
      "loss": 3.5793,
      "step": 850
    },
    {
      "epoch": 1.2164073550212164,
      "grad_norm": 18.16057586669922,
      "learning_rate": 2.972654408297973e-05,
      "loss": 3.7951,
      "step": 860
    },
    {
      "epoch": 1.2305516265912306,
      "grad_norm": 3.5214946269989014,
      "learning_rate": 2.9490806223479494e-05,
      "loss": 4.0263,
      "step": 870
    },
    {
      "epoch": 1.2446958981612446,
      "grad_norm": 20.135046005249023,
      "learning_rate": 2.9255068363979256e-05,
      "loss": 3.678,
      "step": 880
    },
    {
      "epoch": 1.2588401697312588,
      "grad_norm": 35.220733642578125,
      "learning_rate": 2.901933050447902e-05,
      "loss": 4.1981,
      "step": 890
    },
    {
      "epoch": 1.272984441301273,
      "grad_norm": 39.2838134765625,
      "learning_rate": 2.8783592644978786e-05,
      "loss": 3.9173,
      "step": 900
    }
  ],
  "logging_steps": 10,
  "max_steps": 2121,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.3469133437927424e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}