{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1446,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002074688796680498,
"grad_norm": 5.411880763762789,
"learning_rate": 3.4482758620689654e-09,
"logits/chosen": 1.0625,
"logits/rejected": 1.078125,
"logps/chosen": -310.0,
"logps/rejected": -220.0,
"loss": 0.6914,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.02074688796680498,
"grad_norm": 5.356267802595583,
"learning_rate": 3.448275862068965e-08,
"logits/chosen": -0.0576171875,
"logits/rejected": 0.01141357421875,
"logps/chosen": -394.0,
"logps/rejected": -374.0,
"loss": 0.6924,
"rewards/accuracies": 0.2222222238779068,
"rewards/chosen": -0.00445556640625,
"rewards/margins": -0.00445556640625,
"rewards/rejected": 0.0,
"step": 10
},
{
"epoch": 0.04149377593360996,
"grad_norm": 8.221470216280487,
"learning_rate": 6.89655172413793e-08,
"logits/chosen": -0.057373046875,
"logits/rejected": -0.1484375,
"logps/chosen": -304.0,
"logps/rejected": -364.0,
"loss": 0.6925,
"rewards/accuracies": 0.30000001192092896,
"rewards/chosen": 0.0030059814453125,
"rewards/margins": -0.000499725341796875,
"rewards/rejected": 0.003509521484375,
"step": 20
},
{
"epoch": 0.06224066390041494,
"grad_norm": 4.685647512300122,
"learning_rate": 1.0344827586206897e-07,
"logits/chosen": 0.11767578125,
"logits/rejected": 0.09228515625,
"logps/chosen": -350.0,
"logps/rejected": -338.0,
"loss": 0.6918,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": 0.0019989013671875,
"rewards/margins": 0.0030059814453125,
"rewards/rejected": -0.00099945068359375,
"step": 30
},
{
"epoch": 0.08298755186721991,
"grad_norm": 5.392440324607591,
"learning_rate": 1.379310344827586e-07,
"logits/chosen": -0.1611328125,
"logits/rejected": -0.1640625,
"logps/chosen": -342.0,
"logps/rejected": -372.0,
"loss": 0.6916,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.00099945068359375,
"rewards/margins": -0.0030059814453125,
"rewards/rejected": 0.003997802734375,
"step": 40
},
{
"epoch": 0.1037344398340249,
"grad_norm": 5.066645187212037,
"learning_rate": 1.7241379310344828e-07,
"logits/chosen": 0.138671875,
"logits/rejected": 0.2412109375,
"logps/chosen": -262.0,
"logps/rejected": -184.0,
"loss": 0.6906,
"rewards/accuracies": 0.30000001192092896,
"rewards/chosen": 0.0019989013671875,
"rewards/margins": -0.000499725341796875,
"rewards/rejected": 0.00250244140625,
"step": 50
},
{
"epoch": 0.12448132780082988,
"grad_norm": 5.592678715959439,
"learning_rate": 2.0689655172413793e-07,
"logits/chosen": -0.08349609375,
"logits/rejected": -0.0849609375,
"logps/chosen": -512.0,
"logps/rejected": -452.0,
"loss": 0.6884,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.0240478515625,
"rewards/margins": 0.006500244140625,
"rewards/rejected": 0.017578125,
"step": 60
},
{
"epoch": 0.14522821576763487,
"grad_norm": 7.364077184884452,
"learning_rate": 2.413793103448276e-07,
"logits/chosen": 0.1728515625,
"logits/rejected": 0.11865234375,
"logps/chosen": -214.0,
"logps/rejected": -152.0,
"loss": 0.6847,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0240478515625,
"rewards/margins": 0.01904296875,
"rewards/rejected": 0.0050048828125,
"step": 70
},
{
"epoch": 0.16597510373443983,
"grad_norm": 4.583578088481169,
"learning_rate": 2.758620689655172e-07,
"logits/chosen": 0.111328125,
"logits/rejected": -0.031494140625,
"logps/chosen": -288.0,
"logps/rejected": -284.0,
"loss": 0.6819,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0279541015625,
"rewards/margins": 0.0029754638671875,
"rewards/rejected": 0.0250244140625,
"step": 80
},
{
"epoch": 0.18672199170124482,
"grad_norm": 4.763001538688379,
"learning_rate": 3.103448275862069e-07,
"logits/chosen": 0.006011962890625,
"logits/rejected": 0.0810546875,
"logps/chosen": -414.0,
"logps/rejected": -404.0,
"loss": 0.6792,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.1044921875,
"rewards/margins": 0.052978515625,
"rewards/rejected": 0.051513671875,
"step": 90
},
{
"epoch": 0.2074688796680498,
"grad_norm": 4.896926355089114,
"learning_rate": 3.4482758620689656e-07,
"logits/chosen": -0.0257568359375,
"logits/rejected": 0.05859375,
"logps/chosen": -390.0,
"logps/rejected": -324.0,
"loss": 0.6671,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0966796875,
"rewards/margins": 0.09765625,
"rewards/rejected": -0.000537872314453125,
"step": 100
},
{
"epoch": 0.22821576763485477,
"grad_norm": 4.96333643751103,
"learning_rate": 3.793103448275862e-07,
"logits/chosen": 0.0032958984375,
"logits/rejected": 0.003082275390625,
"logps/chosen": -286.0,
"logps/rejected": -296.0,
"loss": 0.6556,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.06494140625,
"rewards/margins": 0.05908203125,
"rewards/rejected": 0.006011962890625,
"step": 110
},
{
"epoch": 0.24896265560165975,
"grad_norm": 5.0505635416993035,
"learning_rate": 4.1379310344827586e-07,
"logits/chosen": -0.1982421875,
"logits/rejected": -0.1962890625,
"logps/chosen": -432.0,
"logps/rejected": -312.0,
"loss": 0.6511,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.07080078125,
"rewards/margins": 0.06884765625,
"rewards/rejected": -0.1396484375,
"step": 120
},
{
"epoch": 0.2697095435684647,
"grad_norm": 5.108647391907642,
"learning_rate": 4.482758620689655e-07,
"logits/chosen": -0.2236328125,
"logits/rejected": -0.1015625,
"logps/chosen": -356.0,
"logps/rejected": -358.0,
"loss": 0.6199,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.0269775390625,
"rewards/margins": 0.2109375,
"rewards/rejected": -0.23828125,
"step": 130
},
{
"epoch": 0.29045643153526973,
"grad_norm": 6.872964653821398,
"learning_rate": 4.827586206896552e-07,
"logits/chosen": -0.228515625,
"logits/rejected": -0.236328125,
"logps/chosen": -376.0,
"logps/rejected": -290.0,
"loss": 0.6187,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.138671875,
"rewards/margins": 0.205078125,
"rewards/rejected": -0.34375,
"step": 140
},
{
"epoch": 0.3112033195020747,
"grad_norm": 6.529806229977509,
"learning_rate": 4.99981778257793e-07,
"logits/chosen": -0.453125,
"logits/rejected": -0.330078125,
"logps/chosen": -456.0,
"logps/rejected": -454.0,
"loss": 0.5776,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.5234375,
"rewards/margins": 0.19140625,
"rewards/rejected": -0.71484375,
"step": 150
},
{
"epoch": 0.33195020746887965,
"grad_norm": 10.04226648908633,
"learning_rate": 4.998360202572815e-07,
"logits/chosen": -0.353515625,
"logits/rejected": -0.486328125,
"logps/chosen": -424.0,
"logps/rejected": -430.0,
"loss": 0.53,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1328125,
"rewards/margins": 0.29296875,
"rewards/rejected": -1.4296875,
"step": 160
},
{
"epoch": 0.35269709543568467,
"grad_norm": 10.68261151488953,
"learning_rate": 4.995445892440316e-07,
"logits/chosen": -0.435546875,
"logits/rejected": -0.32421875,
"logps/chosen": -412.0,
"logps/rejected": -524.0,
"loss": 0.5588,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.9140625,
"rewards/margins": 0.6640625,
"rewards/rejected": -1.578125,
"step": 170
},
{
"epoch": 0.37344398340248963,
"grad_norm": 6.73033593795782,
"learning_rate": 4.991076551440359e-07,
"logits/chosen": -0.5,
"logits/rejected": -0.57421875,
"logps/chosen": -632.0,
"logps/rejected": -648.0,
"loss": 0.554,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.1015625,
"rewards/margins": 0.86328125,
"rewards/rejected": -1.96875,
"step": 180
},
{
"epoch": 0.3941908713692946,
"grad_norm": 9.816239750114915,
"learning_rate": 4.985254727224266e-07,
"logits/chosen": -0.2412109375,
"logits/rejected": -0.1767578125,
"logps/chosen": -416.0,
"logps/rejected": -476.0,
"loss": 0.5251,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.66015625,
"rewards/margins": 0.55859375,
"rewards/rejected": -1.21875,
"step": 190
},
{
"epoch": 0.4149377593360996,
"grad_norm": 7.484451121465821,
"learning_rate": 4.977983814349285e-07,
"logits/chosen": -0.345703125,
"logits/rejected": -0.353515625,
"logps/chosen": -432.0,
"logps/rejected": -498.0,
"loss": 0.5354,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.94921875,
"rewards/margins": 0.71484375,
"rewards/rejected": -1.6640625,
"step": 200
},
{
"epoch": 0.43568464730290457,
"grad_norm": 11.768546958658863,
"learning_rate": 4.969268052299307e-07,
"logits/chosen": -0.373046875,
"logits/rejected": -0.384765625,
"logps/chosen": -362.0,
"logps/rejected": -466.0,
"loss": 0.5133,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.92578125,
"rewards/margins": 1.046875,
"rewards/rejected": -1.9765625,
"step": 210
},
{
"epoch": 0.45643153526970953,
"grad_norm": 11.283753967080404,
"learning_rate": 4.959112523012938e-07,
"logits/chosen": -0.58203125,
"logits/rejected": -0.5859375,
"logps/chosen": -660.0,
"logps/rejected": -700.0,
"loss": 0.4936,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.8828125,
"rewards/margins": 0.82421875,
"rewards/rejected": -2.703125,
"step": 220
},
{
"epoch": 0.47717842323651455,
"grad_norm": 8.364732476956826,
"learning_rate": 4.947523147920345e-07,
"logits/chosen": -0.5,
"logits/rejected": -0.4375,
"logps/chosen": -532.0,
"logps/rejected": -470.0,
"loss": 0.4925,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.109375,
"rewards/margins": 0.4140625,
"rewards/rejected": -1.5234375,
"step": 230
},
{
"epoch": 0.4979253112033195,
"grad_norm": 11.692843266160883,
"learning_rate": 4.934506684490621e-07,
"logits/chosen": -0.4921875,
"logits/rejected": -0.498046875,
"logps/chosen": -448.0,
"logps/rejected": -510.0,
"loss": 0.4734,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.94921875,
"rewards/margins": 0.89453125,
"rewards/rejected": -1.84375,
"step": 240
},
{
"epoch": 0.5186721991701245,
"grad_norm": 9.860975187266469,
"learning_rate": 4.920070722291682e-07,
"logits/chosen": -0.640625,
"logits/rejected": -0.65625,
"logps/chosen": -422.0,
"logps/rejected": -572.0,
"loss": 0.4902,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.2109375,
"rewards/margins": 1.8984375,
"rewards/rejected": -3.109375,
"step": 250
},
{
"epoch": 0.5394190871369294,
"grad_norm": 13.494804689613495,
"learning_rate": 4.904223678564975e-07,
"logits/chosen": -0.53515625,
"logits/rejected": -0.455078125,
"logps/chosen": -482.0,
"logps/rejected": -494.0,
"loss": 0.4797,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.1015625,
"rewards/margins": 0.9296875,
"rewards/rejected": -2.03125,
"step": 260
},
{
"epoch": 0.5601659751037344,
"grad_norm": 12.703890931757634,
"learning_rate": 4.886974793317607e-07,
"logits/chosen": -0.376953125,
"logits/rejected": -0.46875,
"logps/chosen": -580.0,
"logps/rejected": -676.0,
"loss": 0.4202,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.4453125,
"rewards/margins": 1.28125,
"rewards/rejected": -2.71875,
"step": 270
},
{
"epoch": 0.5809128630705395,
"grad_norm": 13.36542419547552,
"learning_rate": 4.86833412393473e-07,
"logits/chosen": -0.40234375,
"logits/rejected": -0.44921875,
"logps/chosen": -436.0,
"logps/rejected": -470.0,
"loss": 0.4683,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.671875,
"rewards/margins": 0.65625,
"rewards/rejected": -2.328125,
"step": 280
},
{
"epoch": 0.6016597510373444,
"grad_norm": 18.998627170239676,
"learning_rate": 4.848312539315334e-07,
"logits/chosen": -0.7578125,
"logits/rejected": -0.78515625,
"logps/chosen": -528.0,
"logps/rejected": -620.0,
"loss": 0.4245,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.4453125,
"rewards/margins": 1.6640625,
"rewards/rejected": -3.109375,
"step": 290
},
{
"epoch": 0.6224066390041494,
"grad_norm": 16.88135073283805,
"learning_rate": 4.826921713534873e-07,
"logits/chosen": -0.3515625,
"logits/rejected": -0.53515625,
"logps/chosen": -552.0,
"logps/rejected": -640.0,
"loss": 0.4323,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.984375,
"rewards/margins": 0.6796875,
"rewards/rejected": -2.65625,
"step": 300
},
{
"epoch": 0.6431535269709544,
"grad_norm": 17.945027682392745,
"learning_rate": 4.804174119038404e-07,
"logits/chosen": -0.4140625,
"logits/rejected": -0.40234375,
"logps/chosen": -486.0,
"logps/rejected": -592.0,
"loss": 0.4525,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.2890625,
"rewards/margins": 1.515625,
"rewards/rejected": -2.8125,
"step": 310
},
{
"epoch": 0.6639004149377593,
"grad_norm": 9.847511875674074,
"learning_rate": 4.78008301936823e-07,
"logits/chosen": -0.67578125,
"logits/rejected": -0.5390625,
"logps/chosen": -548.0,
"logps/rejected": -676.0,
"loss": 0.4356,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.234375,
"rewards/margins": 1.734375,
"rewards/rejected": -2.96875,
"step": 320
},
{
"epoch": 0.6846473029045643,
"grad_norm": 11.752705584029137,
"learning_rate": 4.754662461430258e-07,
"logits/chosen": -0.578125,
"logits/rejected": -0.6328125,
"logps/chosen": -576.0,
"logps/rejected": -588.0,
"loss": 0.418,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.140625,
"rewards/margins": 1.34375,
"rewards/rejected": -3.484375,
"step": 330
},
{
"epoch": 0.7053941908713693,
"grad_norm": 16.053432840092594,
"learning_rate": 4.727927267303612e-07,
"logits/chosen": -0.5390625,
"logits/rejected": -0.423828125,
"logps/chosen": -494.0,
"logps/rejected": -636.0,
"loss": 0.4206,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.109375,
"rewards/margins": 2.0,
"rewards/rejected": -4.09375,
"step": 340
},
{
"epoch": 0.7261410788381742,
"grad_norm": 11.842580622475824,
"learning_rate": 4.699893025598255e-07,
"logits/chosen": -0.66796875,
"logits/rejected": -0.671875,
"logps/chosen": -588.0,
"logps/rejected": -660.0,
"loss": 0.4345,
"rewards/accuracies": 0.5,
"rewards/chosen": -2.28125,
"rewards/margins": 0.435546875,
"rewards/rejected": -2.703125,
"step": 350
},
{
"epoch": 0.7468879668049793,
"grad_norm": 14.819519415062294,
"learning_rate": 4.67057608236567e-07,
"logits/chosen": -0.75390625,
"logits/rejected": -0.69140625,
"logps/chosen": -536.0,
"logps/rejected": -696.0,
"loss": 0.3681,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.609375,
"rewards/margins": 1.7109375,
"rewards/rejected": -4.3125,
"step": 360
},
{
"epoch": 0.7676348547717843,
"grad_norm": 10.366281203736689,
"learning_rate": 4.6399935315678893e-07,
"logits/chosen": -0.408203125,
"logits/rejected": -0.57421875,
"logps/chosen": -660.0,
"logps/rejected": -700.0,
"loss": 0.3893,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.4375,
"rewards/margins": 1.2578125,
"rewards/rejected": -4.6875,
"step": 370
},
{
"epoch": 0.7883817427385892,
"grad_norm": 17.40885052546667,
"learning_rate": 4.608163205110447e-07,
"logits/chosen": -0.546875,
"logits/rejected": -0.515625,
"logps/chosen": -620.0,
"logps/rejected": -592.0,
"loss": 0.4214,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.90625,
"rewards/margins": 0.26171875,
"rewards/rejected": -3.15625,
"step": 380
},
{
"epoch": 0.8091286307053942,
"grad_norm": 12.056546567751306,
"learning_rate": 4.5751036624450445e-07,
"logits/chosen": -0.8515625,
"logits/rejected": -0.7890625,
"logps/chosen": -636.0,
"logps/rejected": -660.0,
"loss": 0.4062,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.125,
"rewards/margins": 0.84375,
"rewards/rejected": -2.96875,
"step": 390
},
{
"epoch": 0.8298755186721992,
"grad_norm": 19.517095745745852,
"learning_rate": 4.540834179748012e-07,
"logits/chosen": -0.41015625,
"logits/rejected": -0.439453125,
"logps/chosen": -376.0,
"logps/rejected": -588.0,
"loss": 0.3903,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.25,
"rewards/margins": 1.5859375,
"rewards/rejected": -2.84375,
"step": 400
},
{
"epoch": 0.8506224066390041,
"grad_norm": 11.340627516184107,
"learning_rate": 4.5053747386808564e-07,
"logits/chosen": -0.5703125,
"logits/rejected": -0.61328125,
"logps/chosen": -556.0,
"logps/rejected": -668.0,
"loss": 0.3894,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.171875,
"rewards/margins": 1.09375,
"rewards/rejected": -3.265625,
"step": 410
},
{
"epoch": 0.8713692946058091,
"grad_norm": 21.853682876132318,
"learning_rate": 4.4687460147394706e-07,
"logits/chosen": -0.73828125,
"logits/rejected": -0.76953125,
"logps/chosen": -612.0,
"logps/rejected": -820.0,
"loss": 0.3944,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.765625,
"rewards/margins": 2.0,
"rewards/rejected": -3.78125,
"step": 420
},
{
"epoch": 0.8921161825726142,
"grad_norm": 9.993671586744828,
"learning_rate": 4.4309693651987726e-07,
"logits/chosen": -0.87109375,
"logits/rejected": -0.796875,
"logps/chosen": -776.0,
"logps/rejected": -804.0,
"loss": 0.3534,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.640625,
"rewards/margins": 1.65625,
"rewards/rejected": -4.28125,
"step": 430
},
{
"epoch": 0.9128630705394191,
"grad_norm": 10.792350406013442,
"learning_rate": 4.3920668166598273e-07,
"logits/chosen": -0.7421875,
"logits/rejected": -0.9140625,
"logps/chosen": -612.0,
"logps/rejected": -804.0,
"loss": 0.321,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.625,
"rewards/margins": 2.328125,
"rewards/rejected": -4.9375,
"step": 440
},
{
"epoch": 0.9336099585062241,
"grad_norm": 17.038293447631034,
"learning_rate": 4.352061052206695e-07,
"logits/chosen": -0.828125,
"logits/rejected": -0.859375,
"logps/chosen": -488.0,
"logps/rejected": -680.0,
"loss": 0.3409,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.3125,
"rewards/margins": 1.78125,
"rewards/rejected": -4.09375,
"step": 450
},
{
"epoch": 0.9543568464730291,
"grad_norm": 13.168279519384841,
"learning_rate": 4.3109753981805045e-07,
"logits/chosen": -0.6171875,
"logits/rejected": -0.62890625,
"logps/chosen": -660.0,
"logps/rejected": -760.0,
"loss": 0.3341,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.8828125,
"rewards/margins": 2.6875,
"rewards/rejected": -4.5625,
"step": 460
},
{
"epoch": 0.975103734439834,
"grad_norm": 36.8656206464395,
"learning_rate": 4.2688338105784584e-07,
"logits/chosen": -0.55078125,
"logits/rejected": -0.66796875,
"logps/chosen": -736.0,
"logps/rejected": -764.0,
"loss": 0.339,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.515625,
"rewards/margins": 2.234375,
"rewards/rejected": -4.75,
"step": 470
},
{
"epoch": 0.995850622406639,
"grad_norm": 14.773211954922196,
"learning_rate": 4.2256608610857014e-07,
"logits/chosen": -0.86328125,
"logits/rejected": -0.78515625,
"logps/chosen": -564.0,
"logps/rejected": -780.0,
"loss": 0.3697,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.03125,
"rewards/margins": 2.671875,
"rewards/rejected": -4.71875,
"step": 480
},
{
"epoch": 1.016597510373444,
"grad_norm": 15.162967960468519,
"learning_rate": 4.181481722748197e-07,
"logits/chosen": -0.78125,
"logits/rejected": -0.84765625,
"logps/chosen": -476.0,
"logps/rejected": -656.0,
"loss": 0.2396,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.921875,
"rewards/margins": 2.109375,
"rewards/rejected": -4.03125,
"step": 490
},
{
"epoch": 1.037344398340249,
"grad_norm": 19.132102905117456,
"learning_rate": 4.136322155294968e-07,
"logits/chosen": -0.6171875,
"logits/rejected": -0.80859375,
"logps/chosen": -688.0,
"logps/rejected": -896.0,
"loss": 0.1539,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.65625,
"rewards/margins": 2.796875,
"rewards/rejected": -5.4375,
"step": 500
},
{
"epoch": 1.058091286307054,
"grad_norm": 8.59971354606536,
"learning_rate": 4.090208490118253e-07,
"logits/chosen": -0.73828125,
"logits/rejected": -0.8046875,
"logps/chosen": -796.0,
"logps/rejected": -920.0,
"loss": 0.1643,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.609375,
"rewards/margins": 2.78125,
"rewards/rejected": -5.40625,
"step": 510
},
{
"epoch": 1.0788381742738589,
"grad_norm": 16.6448813170998,
"learning_rate": 4.0431676149203457e-07,
"logits/chosen": -0.4375,
"logits/rejected": -0.49609375,
"logps/chosen": -454.0,
"logps/rejected": -912.0,
"loss": 0.1488,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.390625,
"rewards/margins": 4.53125,
"rewards/rejected": -6.90625,
"step": 520
},
{
"epoch": 1.099585062240664,
"grad_norm": 9.144636292899186,
"learning_rate": 3.995226958036058e-07,
"logits/chosen": -0.42578125,
"logits/rejected": -0.56640625,
"logps/chosen": -712.0,
"logps/rejected": -1012.0,
"loss": 0.1644,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.859375,
"rewards/margins": 3.53125,
"rewards/rejected": -7.375,
"step": 530
},
{
"epoch": 1.120331950207469,
"grad_norm": 14.589338073597446,
"learning_rate": 3.9464144724399605e-07,
"logits/chosen": -0.51171875,
"logits/rejected": -0.421875,
"logps/chosen": -680.0,
"logps/rejected": -1040.0,
"loss": 0.1382,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.5625,
"rewards/margins": 3.6875,
"rewards/rejected": -6.25,
"step": 540
},
{
"epoch": 1.1410788381742738,
"grad_norm": 7.897607849867475,
"learning_rate": 3.896758619447714e-07,
"logits/chosen": -0.75,
"logits/rejected": -0.6796875,
"logps/chosen": -552.0,
"logps/rejected": -888.0,
"loss": 0.1505,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.4375,
"rewards/margins": 3.96875,
"rewards/rejected": -6.40625,
"step": 550
},
{
"epoch": 1.161825726141079,
"grad_norm": 10.375941833434622,
"learning_rate": 3.846288352121003e-07,
"logits/chosen": -0.76953125,
"logits/rejected": -0.75390625,
"logps/chosen": -668.0,
"logps/rejected": -892.0,
"loss": 0.1659,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.484375,
"rewards/margins": 2.515625,
"rewards/rejected": -6.0,
"step": 560
},
{
"epoch": 1.1825726141078838,
"grad_norm": 9.709492497216598,
"learning_rate": 3.795033098385744e-07,
"logits/chosen": -0.62109375,
"logits/rejected": -0.61328125,
"logps/chosen": -612.0,
"logps/rejected": -952.0,
"loss": 0.1486,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -3.1875,
"rewards/margins": 2.953125,
"rewards/rejected": -6.125,
"step": 570
},
{
"epoch": 1.2033195020746887,
"grad_norm": 18.021248024950815,
"learning_rate": 3.7430227438734086e-07,
"logits/chosen": -0.76171875,
"logits/rejected": -0.7109375,
"logps/chosen": -872.0,
"logps/rejected": -1168.0,
"loss": 0.128,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -4.28125,
"rewards/margins": 3.375,
"rewards/rejected": -7.625,
"step": 580
},
{
"epoch": 1.2240663900414939,
"grad_norm": 10.12277474430289,
"learning_rate": 3.690287614495481e-07,
"logits/chosen": -0.875,
"logits/rejected": -0.83984375,
"logps/chosen": -760.0,
"logps/rejected": -1216.0,
"loss": 0.1163,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.0625,
"rewards/margins": 4.375,
"rewards/rejected": -8.4375,
"step": 590
},
{
"epoch": 1.2448132780082988,
"grad_norm": 16.358228454815244,
"learning_rate": 3.6368584587611854e-07,
"logits/chosen": -0.70703125,
"logits/rejected": -0.75390625,
"logps/chosen": -596.0,
"logps/rejected": -1012.0,
"loss": 0.1356,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.84375,
"rewards/margins": 4.1875,
"rewards/rejected": -7.03125,
"step": 600
},
{
"epoch": 1.2655601659751037,
"grad_norm": 13.263894273990404,
"learning_rate": 3.582766429848818e-07,
"logits/chosen": -0.91796875,
"logits/rejected": -0.8671875,
"logps/chosen": -740.0,
"logps/rejected": -952.0,
"loss": 0.1465,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.75,
"rewards/margins": 2.828125,
"rewards/rejected": -6.5625,
"step": 610
},
{
"epoch": 1.2863070539419086,
"grad_norm": 13.824768954350477,
"learning_rate": 3.528043067441123e-07,
"logits/chosen": -0.5234375,
"logits/rejected": -0.5703125,
"logps/chosen": -470.0,
"logps/rejected": -772.0,
"loss": 0.1661,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.265625,
"rewards/margins": 2.6875,
"rewards/rejected": -4.96875,
"step": 620
},
{
"epoch": 1.3070539419087137,
"grad_norm": 11.669758162071892,
"learning_rate": 3.472720279335305e-07,
"logits/chosen": -0.8828125,
"logits/rejected": -0.8359375,
"logps/chosen": -756.0,
"logps/rejected": -1056.0,
"loss": 0.1428,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.578125,
"rewards/margins": 3.46875,
"rewards/rejected": -7.03125,
"step": 630
},
{
"epoch": 1.3278008298755186,
"grad_norm": 11.36461562294381,
"learning_rate": 3.4168303228384097e-07,
"logits/chosen": -0.73046875,
"logits/rejected": -0.82421875,
"logps/chosen": -740.0,
"logps/rejected": -1056.0,
"loss": 0.1497,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.625,
"rewards/margins": 2.90625,
"rewards/rejected": -6.53125,
"step": 640
},
{
"epoch": 1.3485477178423237,
"grad_norm": 10.722964833155572,
"learning_rate": 3.36040578595891e-07,
"logits/chosen": -0.8046875,
"logits/rejected": -0.8828125,
"logps/chosen": -696.0,
"logps/rejected": -948.0,
"loss": 0.1362,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.828125,
"rewards/margins": 2.828125,
"rewards/rejected": -5.65625,
"step": 650
},
{
"epoch": 1.3692946058091287,
"grad_norm": 9.800531956736956,
"learning_rate": 3.303479568405467e-07,
"logits/chosen": -0.7734375,
"logits/rejected": -0.86328125,
"logps/chosen": -700.0,
"logps/rejected": -848.0,
"loss": 0.1266,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.140625,
"rewards/margins": 2.1875,
"rewards/rejected": -5.34375,
"step": 660
},
{
"epoch": 1.3900414937759336,
"grad_norm": 10.691535240032913,
"learning_rate": 3.246084862403949e-07,
"logits/chosen": -0.65625,
"logits/rejected": -0.6875,
"logps/chosen": -800.0,
"logps/rejected": -1224.0,
"loss": 0.1275,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.421875,
"rewards/margins": 5.125,
"rewards/rejected": -8.5,
"step": 670
},
{
"epoch": 1.4107883817427385,
"grad_norm": 19.2843124932583,
"learning_rate": 3.188255133343896e-07,
"logits/chosen": -0.87109375,
"logits/rejected": -0.8203125,
"logps/chosen": -804.0,
"logps/rejected": -1120.0,
"loss": 0.1339,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.59375,
"rewards/margins": 2.859375,
"rewards/rejected": -6.4375,
"step": 680
},
{
"epoch": 1.4315352697095436,
"grad_norm": 13.391310848340655,
"learning_rate": 3.1300241002656964e-07,
"logits/chosen": -0.7578125,
"logits/rejected": -0.75390625,
"logps/chosen": -768.0,
"logps/rejected": -1200.0,
"loss": 0.1261,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.15625,
"rewards/margins": 5.0,
"rewards/rejected": -8.1875,
"step": 690
},
{
"epoch": 1.4522821576763485,
"grad_norm": 12.375177940845006,
"learning_rate": 3.071425716199882e-07,
"logits/chosen": -0.92578125,
"logits/rejected": -0.9765625,
"logps/chosen": -624.0,
"logps/rejected": -1128.0,
"loss": 0.1246,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.84375,
"rewards/margins": 4.96875,
"rewards/rejected": -7.8125,
"step": 700
},
{
"epoch": 1.4730290456431536,
"grad_norm": 14.850198748586445,
"learning_rate": 3.0124941483699753e-07,
"logits/chosen": -0.8671875,
"logits/rejected": -0.87109375,
"logps/chosen": -804.0,
"logps/rejected": -1096.0,
"loss": 0.1065,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.21875,
"rewards/margins": 3.734375,
"rewards/rejected": -7.9375,
"step": 710
},
{
"epoch": 1.4937759336099585,
"grad_norm": 12.507368273272412,
"learning_rate": 2.953263758270459e-07,
"logits/chosen": -0.66796875,
"logits/rejected": -0.796875,
"logps/chosen": -564.0,
"logps/rejected": -788.0,
"loss": 0.1412,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.5625,
"rewards/margins": 2.84375,
"rewards/rejected": -5.375,
"step": 720
},
{
"epoch": 1.5145228215767634,
"grad_norm": 12.416603553859566,
"learning_rate": 2.8937690816314577e-07,
"logits/chosen": -0.91015625,
"logits/rejected": -0.953125,
"logps/chosen": -648.0,
"logps/rejected": -996.0,
"loss": 0.0877,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.25,
"rewards/margins": 3.46875,
"rewards/rejected": -6.71875,
"step": 730
},
{
"epoch": 1.5352697095435683,
"grad_norm": 30.476655601465435,
"learning_rate": 2.834044808281841e-07,
"logits/chosen": -0.76953125,
"logits/rejected": -0.8515625,
"logps/chosen": -748.0,
"logps/rejected": -1168.0,
"loss": 0.1211,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.609375,
"rewards/margins": 4.78125,
"rewards/rejected": -8.375,
"step": 740
},
{
"epoch": 1.5560165975103735,
"grad_norm": 16.239887747462344,
"learning_rate": 2.774125761922463e-07,
"logits/chosen": -0.7734375,
"logits/rejected": -0.78515625,
"logps/chosen": -588.0,
"logps/rejected": -972.0,
"loss": 0.1176,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.9375,
"rewards/margins": 4.15625,
"rewards/rejected": -7.09375,
"step": 750
},
{
"epoch": 1.5767634854771784,
"grad_norm": 17.482193111874572,
"learning_rate": 2.714046879821358e-07,
"logits/chosen": -0.73046875,
"logits/rejected": -0.8515625,
"logps/chosen": -700.0,
"logps/rejected": -1224.0,
"loss": 0.1128,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.359375,
"rewards/margins": 4.78125,
"rewards/rejected": -8.125,
"step": 760
},
{
"epoch": 1.5975103734439835,
"grad_norm": 8.31205194284139,
"learning_rate": 2.653843192442699e-07,
"logits/chosen": -0.70703125,
"logits/rejected": -0.7578125,
"logps/chosen": -724.0,
"logps/rejected": -1176.0,
"loss": 0.1338,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.609375,
"rewards/margins": 5.15625,
"rewards/rejected": -8.75,
"step": 770
},
{
"epoch": 1.6182572614107884,
"grad_norm": 10.622352189872272,
"learning_rate": 2.5935498030214397e-07,
"logits/chosen": -0.81640625,
"logits/rejected": -0.8125,
"logps/chosen": -756.0,
"logps/rejected": -1080.0,
"loss": 0.0991,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -3.609375,
"rewards/margins": 3.21875,
"rewards/rejected": -6.8125,
"step": 780
},
{
"epoch": 1.6390041493775933,
"grad_norm": 9.417824682877646,
"learning_rate": 2.533201867095504e-07,
"logits/chosen": -0.6953125,
"logits/rejected": -0.76171875,
"logps/chosen": -840.0,
"logps/rejected": -1312.0,
"loss": 0.1458,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.625,
"rewards/margins": 5.59375,
"rewards/rejected": -10.25,
"step": 790
},
{
"epoch": 1.6597510373443982,
"grad_norm": 11.234878509540781,
"learning_rate": 2.472834572007493e-07,
"logits/chosen": -0.921875,
"logits/rejected": -0.8671875,
"logps/chosen": -608.0,
"logps/rejected": -1048.0,
"loss": 0.1348,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.296875,
"rewards/margins": 4.5625,
"rewards/rejected": -7.84375,
"step": 800
},
{
"epoch": 1.6804979253112033,
"grad_norm": 14.091480064608236,
"learning_rate": 2.4124831163878427e-07,
"logits/chosen": -0.81640625,
"logits/rejected": -0.8515625,
"logps/chosen": -764.0,
"logps/rejected": -1240.0,
"loss": 0.1137,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.0,
"rewards/margins": 4.59375,
"rewards/rejected": -8.625,
"step": 810
},
{
"epoch": 1.7012448132780082,
"grad_norm": 6.963974139105738,
"learning_rate": 2.3521826896313965e-07,
"logits/chosen": -0.9375,
"logits/rejected": -0.9609375,
"logps/chosen": -840.0,
"logps/rejected": -1384.0,
"loss": 0.1082,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.90625,
"rewards/margins": 5.5,
"rewards/rejected": -10.375,
"step": 820
},
{
"epoch": 1.7219917012448134,
"grad_norm": 15.006582270008211,
"learning_rate": 2.2919684513793704e-07,
"logits/chosen": -0.78515625,
"logits/rejected": -0.8515625,
"logps/chosen": -764.0,
"logps/rejected": -1240.0,
"loss": 0.0973,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.3125,
"rewards/margins": 4.875,
"rewards/rejected": -9.1875,
"step": 830
},
{
"epoch": 1.7427385892116183,
"grad_norm": 7.046277080292156,
"learning_rate": 2.2318755110186602e-07,
"logits/chosen": -0.69140625,
"logits/rejected": -0.77734375,
"logps/chosen": -604.0,
"logps/rejected": -996.0,
"loss": 0.1284,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.03125,
"rewards/margins": 3.765625,
"rewards/rejected": -6.8125,
"step": 840
},
{
"epoch": 1.7634854771784232,
"grad_norm": 11.157911173958768,
"learning_rate": 2.171938907210457e-07,
"logits/chosen": -0.77734375,
"logits/rejected": -0.72265625,
"logps/chosen": -712.0,
"logps/rejected": -1120.0,
"loss": 0.0941,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.0625,
"rewards/margins": 4.21875,
"rewards/rejected": -8.25,
"step": 850
},
{
"epoch": 1.784232365145228,
"grad_norm": 7.821478727431665,
"learning_rate": 2.1121935874600914e-07,
"logits/chosen": -0.58203125,
"logits/rejected": -0.63671875,
"logps/chosen": -712.0,
"logps/rejected": -1064.0,
"loss": 0.0788,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.765625,
"rewards/margins": 3.90625,
"rewards/rejected": -7.65625,
"step": 860
},
{
"epoch": 1.8049792531120332,
"grad_norm": 19.017867203624398,
"learning_rate": 2.052674387740039e-07,
"logits/chosen": -0.71875,
"logits/rejected": -0.80859375,
"logps/chosen": -732.0,
"logps/rejected": -1200.0,
"loss": 0.1237,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -4.46875,
"rewards/margins": 4.875,
"rewards/rejected": -9.375,
"step": 870
},
{
"epoch": 1.8257261410788381,
"grad_norm": 8.469359704887415,
"learning_rate": 1.9934160121779511e-07,
"logits/chosen": -0.6484375,
"logits/rejected": -0.64453125,
"logps/chosen": -768.0,
"logps/rejected": -1264.0,
"loss": 0.095,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.703125,
"rewards/margins": 4.9375,
"rewards/rejected": -8.625,
"step": 880
},
{
"epoch": 1.8464730290456433,
"grad_norm": 14.658113601459338,
"learning_rate": 1.9344530128215644e-07,
"logits/chosen": -0.78515625,
"logits/rejected": -0.890625,
"logps/chosen": -816.0,
"logps/rejected": -1224.0,
"loss": 0.116,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.546875,
"rewards/margins": 5.09375,
"rewards/rejected": -8.625,
"step": 890
},
{
"epoch": 1.8672199170124482,
"grad_norm": 26.19755259819255,
"learning_rate": 1.8758197694922812e-07,
"logits/chosen": -0.609375,
"logits/rejected": -0.69140625,
"logps/chosen": -816.0,
"logps/rejected": -1184.0,
"loss": 0.1326,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -4.71875,
"rewards/margins": 4.0625,
"rewards/rejected": -8.75,
"step": 900
},
{
"epoch": 1.887966804979253,
"grad_norm": 14.081930494469324,
"learning_rate": 1.8175504697391728e-07,
"logits/chosen": -0.79296875,
"logits/rejected": -0.76171875,
"logps/chosen": -804.0,
"logps/rejected": -1168.0,
"loss": 0.0909,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.34375,
"rewards/margins": 4.65625,
"rewards/rejected": -9.0,
"step": 910
},
{
"epoch": 1.908713692946058,
"grad_norm": 8.117586964004555,
"learning_rate": 1.7596790889050907e-07,
"logits/chosen": -0.8671875,
"logits/rejected": -0.8671875,
"logps/chosen": -652.0,
"logps/rejected": -1056.0,
"loss": 0.1183,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.34375,
"rewards/margins": 4.15625,
"rewards/rejected": -7.5,
"step": 920
},
{
"epoch": 1.929460580912863,
"grad_norm": 14.392383053670414,
"learning_rate": 1.702239370316515e-07,
"logits/chosen": -0.78515625,
"logits/rejected": -0.8671875,
"logps/chosen": -768.0,
"logps/rejected": -1200.0,
"loss": 0.1086,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.78125,
"rewards/margins": 4.78125,
"rewards/rejected": -8.5625,
"step": 930
},
{
"epoch": 1.950207468879668,
"grad_norm": 11.827232109289305,
"learning_rate": 1.645264805608674e-07,
"logits/chosen": -0.83984375,
"logits/rejected": -0.80078125,
"logps/chosen": -872.0,
"logps/rejected": -1304.0,
"loss": 0.0874,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -5.125,
"rewards/margins": 4.34375,
"rewards/rejected": -9.5,
"step": 940
},
{
"epoch": 1.9709543568464731,
"grad_norm": 17.849319216864608,
"learning_rate": 1.58878861519743e-07,
"logits/chosen": -0.84375,
"logits/rejected": -0.8828125,
"logps/chosen": -768.0,
"logps/rejected": -1144.0,
"loss": 0.1201,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -4.46875,
"rewards/margins": 3.640625,
"rewards/rejected": -8.125,
"step": 950
},
{
"epoch": 1.991701244813278,
"grad_norm": 17.73316536429117,
"learning_rate": 1.5328437289093015e-07,
"logits/chosen": -0.90234375,
"logits/rejected": -0.87890625,
"logps/chosen": -772.0,
"logps/rejected": -1336.0,
"loss": 0.0932,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.90625,
"rewards/margins": 5.0,
"rewards/rejected": -9.875,
"step": 960
},
{
"epoch": 2.012448132780083,
"grad_norm": 1.7808893883270032,
"learning_rate": 1.4774627667809223e-07,
"logits/chosen": -1.1015625,
"logits/rejected": -1.09375,
"logps/chosen": -808.0,
"logps/rejected": -1184.0,
"loss": 0.0532,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.40625,
"rewards/margins": 4.5625,
"rewards/rejected": -9.0,
"step": 970
},
{
"epoch": 2.033195020746888,
"grad_norm": 13.54516810123686,
"learning_rate": 1.4226780200391267e-07,
"logits/chosen": -0.330078125,
"logits/rejected": -0.474609375,
"logps/chosen": -884.0,
"logps/rejected": -1552.0,
"loss": 0.026,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.96875,
"rewards/margins": 5.53125,
"rewards/rejected": -11.5,
"step": 980
},
{
"epoch": 2.0539419087136928,
"grad_norm": 1.8805874765997683,
"learning_rate": 1.3685214322727596e-07,
"logits/chosen": -0.7109375,
"logits/rejected": -0.84765625,
"logps/chosen": -948.0,
"logps/rejected": -1584.0,
"loss": 0.0162,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.40625,
"rewards/margins": 6.46875,
"rewards/rejected": -12.875,
"step": 990
},
{
"epoch": 2.074688796680498,
"grad_norm": 1.3911921785236845,
"learning_rate": 1.3150245808071854e-07,
"logits/chosen": -0.75390625,
"logits/rejected": -0.80078125,
"logps/chosen": -956.0,
"logps/rejected": -1600.0,
"loss": 0.0125,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.53125,
"rewards/margins": 6.53125,
"rewards/rejected": -13.0625,
"step": 1000
},
{
"epoch": 2.095435684647303,
"grad_norm": 2.437460163435022,
"learning_rate": 1.2622186582923566e-07,
"logits/chosen": -0.578125,
"logits/rejected": -0.62109375,
"logps/chosen": -1136.0,
"logps/rejected": -1608.0,
"loss": 0.0135,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.40625,
"rewards/margins": 5.46875,
"rewards/rejected": -11.875,
"step": 1010
},
{
"epoch": 2.116182572614108,
"grad_norm": 3.5023691171107245,
"learning_rate": 1.2101344545151713e-07,
"logits/chosen": -0.63671875,
"logits/rejected": -0.609375,
"logps/chosen": -844.0,
"logps/rejected": -1552.0,
"loss": 0.0142,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.34375,
"rewards/margins": 7.15625,
"rewards/rejected": -12.5,
"step": 1020
},
{
"epoch": 2.136929460580913,
"grad_norm": 4.086843498233057,
"learning_rate": 1.1588023384467335e-07,
"logits/chosen": -0.7578125,
"logits/rejected": -0.81640625,
"logps/chosen": -1112.0,
"logps/rejected": -1848.0,
"loss": 0.0212,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.6875,
"rewards/margins": 7.84375,
"rewards/rejected": -15.5625,
"step": 1030
},
{
"epoch": 2.1576763485477177,
"grad_norm": 6.577269547407749,
"learning_rate": 1.1082522405349834e-07,
"logits/chosen": -0.61328125,
"logits/rejected": -0.61328125,
"logps/chosen": -1080.0,
"logps/rejected": -1760.0,
"loss": 0.0151,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.125,
"rewards/margins": 7.53125,
"rewards/rejected": -14.625,
"step": 1040
},
{
"epoch": 2.1784232365145226,
"grad_norm": 3.091476943089419,
"learning_rate": 1.0585136352530172e-07,
"logits/chosen": -0.92578125,
"logits/rejected": -0.9375,
"logps/chosen": -1080.0,
"logps/rejected": -1600.0,
"loss": 0.0155,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.03125,
"rewards/margins": 5.28125,
"rewards/rejected": -12.3125,
"step": 1050
},
{
"epoch": 2.199170124481328,
"grad_norm": 6.810330147221256,
"learning_rate": 1.0096155239132675e-07,
"logits/chosen": -0.61328125,
"logits/rejected": -0.63671875,
"logps/chosen": -776.0,
"logps/rejected": -1272.0,
"loss": 0.0206,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.3125,
"rewards/margins": 5.5,
"rewards/rejected": -10.875,
"step": 1060
},
{
"epoch": 2.219917012448133,
"grad_norm": 1.5020933483656527,
"learning_rate": 9.615864177575836e-08,
"logits/chosen": -0.7734375,
"logits/rejected": -0.81640625,
"logps/chosen": -1384.0,
"logps/rejected": -1984.0,
"loss": 0.0131,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.78125,
"rewards/margins": 8.5,
"rewards/rejected": -16.25,
"step": 1070
},
{
"epoch": 2.240663900414938,
"grad_norm": 1.8136977524511886,
"learning_rate": 9.144543213330493e-08,
"logits/chosen": -0.85546875,
"logits/rejected": -0.8984375,
"logps/chosen": -1072.0,
"logps/rejected": -1904.0,
"loss": 0.0115,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.90625,
"rewards/margins": 8.5625,
"rewards/rejected": -15.5,
"step": 1080
},
{
"epoch": 2.2614107883817427,
"grad_norm": 1.1956894404186822,
"learning_rate": 8.682467161632508e-08,
"logits/chosen": -0.7734375,
"logits/rejected": -0.859375,
"logps/chosen": -956.0,
"logps/rejected": -1688.0,
"loss": 0.0129,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.0625,
"rewards/margins": 7.3125,
"rewards/rejected": -14.375,
"step": 1090
},
{
"epoch": 2.2821576763485476,
"grad_norm": 13.368765403015455,
"learning_rate": 8.229905447244942e-08,
"logits/chosen": -0.76171875,
"logits/rejected": -0.796875,
"logps/chosen": -1304.0,
"logps/rejected": -1776.0,
"loss": 0.0196,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.59375,
"rewards/margins": 6.78125,
"rewards/rejected": -14.375,
"step": 1100
},
{
"epoch": 2.3029045643153525,
"grad_norm": 2.690805031469103,
"learning_rate": 7.787121947363393e-08,
"logits/chosen": -0.66796875,
"logits/rejected": -0.7109375,
"logps/chosen": -1224.0,
"logps/rejected": -1952.0,
"loss": 0.0219,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.25,
"rewards/margins": 7.59375,
"rewards/rejected": -15.875,
"step": 1110
},
{
"epoch": 2.323651452282158,
"grad_norm": 2.4290925851945926,
"learning_rate": 7.354374837755919e-08,
"logits/chosen": -0.69921875,
"logits/rejected": -0.7578125,
"logps/chosen": -1320.0,
"logps/rejected": -2080.0,
"loss": 0.0158,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.0625,
"rewards/margins": 8.375,
"rewards/rejected": -17.5,
"step": 1120
},
{
"epoch": 2.3443983402489628,
"grad_norm": 1.5008186930985454,
"learning_rate": 6.931916442227335e-08,
"logits/chosen": -0.78125,
"logits/rejected": -0.77734375,
"logps/chosen": -1104.0,
"logps/rejected": -1664.0,
"loss": 0.0111,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.8125,
"rewards/margins": 6.125,
"rewards/rejected": -13.9375,
"step": 1130
},
{
"epoch": 2.3651452282157677,
"grad_norm": 0.4121106095749944,
"learning_rate": 6.519993085495622e-08,
"logits/chosen": -0.6171875,
"logits/rejected": -0.68359375,
"logps/chosen": -1048.0,
"logps/rejected": -1720.0,
"loss": 0.0166,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.28125,
"rewards/margins": 6.78125,
"rewards/rejected": -13.0625,
"step": 1140
},
{
"epoch": 2.3858921161825726,
"grad_norm": 2.9431081086738957,
"learning_rate": 6.118844949566293e-08,
"logits/chosen": -0.66015625,
"logits/rejected": -0.8046875,
"logps/chosen": -1012.0,
"logps/rejected": -1800.0,
"loss": 0.0097,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.78125,
"rewards/margins": 8.5,
"rewards/rejected": -15.25,
"step": 1150
},
{
"epoch": 2.4066390041493775,
"grad_norm": 3.2000573542150286,
"learning_rate": 5.728705933688349e-08,
"logits/chosen": -0.71875,
"logits/rejected": -0.69140625,
"logps/chosen": -1088.0,
"logps/rejected": -1936.0,
"loss": 0.0179,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.71875,
"rewards/margins": 8.6875,
"rewards/rejected": -16.375,
"step": 1160
},
{
"epoch": 2.4273858921161824,
"grad_norm": 1.0558871880279557,
"learning_rate": 5.3498035179736475e-08,
"logits/chosen": -0.546875,
"logits/rejected": -0.64453125,
"logps/chosen": -856.0,
"logps/rejected": -1600.0,
"loss": 0.0119,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.4375,
"rewards/margins": 7.3125,
"rewards/rejected": -13.75,
"step": 1170
},
{
"epoch": 2.4481327800829877,
"grad_norm": 10.613994074318533,
"learning_rate": 4.98235863075899e-08,
"logits/chosen": -0.6796875,
"logits/rejected": -0.70703125,
"logps/chosen": -1024.0,
"logps/rejected": -1768.0,
"loss": 0.0164,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.125,
"rewards/margins": 7.59375,
"rewards/rejected": -14.6875,
"step": 1180
},
{
"epoch": 2.4688796680497926,
"grad_norm": 1.5107625022692397,
"learning_rate": 4.626585519788476e-08,
"logits/chosen": -0.71484375,
"logits/rejected": -0.72265625,
"logps/chosen": -1040.0,
"logps/rejected": -2040.0,
"loss": 0.0124,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.75,
"rewards/margins": 10.0,
"rewards/rejected": -16.75,
"step": 1190
},
{
"epoch": 2.4896265560165975,
"grad_norm": 0.6379884298790988,
"learning_rate": 4.2826916272911154e-08,
"logits/chosen": -0.490234375,
"logits/rejected": -0.58203125,
"logps/chosen": -1136.0,
"logps/rejected": -1672.0,
"loss": 0.0143,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -7.625,
"rewards/margins": 6.5,
"rewards/rejected": -14.125,
"step": 1200
},
{
"epoch": 2.5103734439834025,
"grad_norm": 1.1839781270784333,
"learning_rate": 3.950877469026523e-08,
"logits/chosen": -0.55859375,
"logits/rejected": -0.6953125,
"logps/chosen": -1248.0,
"logps/rejected": -2144.0,
"loss": 0.0132,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.6875,
"rewards/margins": 10.0,
"rewards/rejected": -18.625,
"step": 1210
},
{
"epoch": 2.5311203319502074,
"grad_norm": 14.154020819707466,
"learning_rate": 3.631336517369313e-08,
"logits/chosen": -0.75390625,
"logits/rejected": -0.86328125,
"logps/chosen": -968.0,
"logps/rejected": -1704.0,
"loss": 0.0174,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.65625,
"rewards/margins": 7.3125,
"rewards/rejected": -14.0,
"step": 1220
},
{
"epoch": 2.5518672199170123,
"grad_norm": 4.473861321502488,
"learning_rate": 3.3242550885002805e-08,
"logits/chosen": -0.71875,
"logits/rejected": -0.8046875,
"logps/chosen": -1336.0,
"logps/rejected": -1856.0,
"loss": 0.0088,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.96875,
"rewards/margins": 7.25,
"rewards/rejected": -15.1875,
"step": 1230
},
{
"epoch": 2.572614107883817,
"grad_norm": 0.8764104869307379,
"learning_rate": 3.029812233770215e-08,
"logits/chosen": -0.71484375,
"logits/rejected": -0.765625,
"logps/chosen": -836.0,
"logps/rejected": -1504.0,
"loss": 0.0114,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.84375,
"rewards/margins": 6.90625,
"rewards/rejected": -12.75,
"step": 1240
},
{
"epoch": 2.5933609958506225,
"grad_norm": 0.8441509449443175,
"learning_rate": 2.74817963529958e-08,
"logits/chosen": -0.490234375,
"logits/rejected": -0.52734375,
"logps/chosen": -1072.0,
"logps/rejected": -1880.0,
"loss": 0.0117,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.75,
"rewards/margins": 7.9375,
"rewards/rejected": -15.6875,
"step": 1250
},
{
"epoch": 2.6141078838174274,
"grad_norm": 2.8897995526214415,
"learning_rate": 2.479521505875079e-08,
"logits/chosen": -0.71484375,
"logits/rejected": -0.7734375,
"logps/chosen": -1192.0,
"logps/rejected": -1808.0,
"loss": 0.0095,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.0,
"rewards/margins": 7.0,
"rewards/rejected": -15.0,
"step": 1260
},
{
"epoch": 2.6348547717842323,
"grad_norm": 2.0317827036066234,
"learning_rate": 2.223994493201342e-08,
"logits/chosen": -0.79296875,
"logits/rejected": -0.80078125,
"logps/chosen": -1144.0,
"logps/rejected": -1856.0,
"loss": 0.0083,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.0625,
"rewards/margins": 7.71875,
"rewards/rejected": -15.75,
"step": 1270
},
{
"epoch": 2.6556016597510372,
"grad_norm": 3.365990349119932,
"learning_rate": 1.9817475885636868e-08,
"logits/chosen": -0.70703125,
"logits/rejected": -0.78125,
"logps/chosen": -1064.0,
"logps/rejected": -1824.0,
"loss": 0.0137,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.875,
"rewards/margins": 7.75,
"rewards/rejected": -14.625,
"step": 1280
},
{
"epoch": 2.6763485477178426,
"grad_norm": 1.6615356719262115,
"learning_rate": 1.7529220399550376e-08,
"logits/chosen": -0.58984375,
"logits/rejected": -0.7734375,
"logps/chosen": -1280.0,
"logps/rejected": -2336.0,
"loss": 0.0101,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.4375,
"rewards/margins": 10.1875,
"rewards/rejected": -19.625,
"step": 1290
},
{
"epoch": 2.6970954356846475,
"grad_norm": 1.1273270140301521,
"learning_rate": 1.5376512697178713e-08,
"logits/chosen": -0.58203125,
"logits/rejected": -0.6875,
"logps/chosen": -1056.0,
"logps/rejected": -1752.0,
"loss": 0.0156,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.34375,
"rewards/margins": 7.75,
"rewards/rejected": -15.125,
"step": 1300
},
{
"epoch": 2.7178423236514524,
"grad_norm": 7.344349531123166,
"learning_rate": 1.3360607967490307e-08,
"logits/chosen": -0.81640625,
"logits/rejected": -0.83203125,
"logps/chosen": -1416.0,
"logps/rejected": -2192.0,
"loss": 0.0125,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.25,
"rewards/margins": 8.625,
"rewards/rejected": -17.875,
"step": 1310
},
{
"epoch": 2.7385892116182573,
"grad_norm": 0.7630154530022901,
"learning_rate": 1.1482681633128738e-08,
"logits/chosen": -0.58984375,
"logits/rejected": -0.71875,
"logps/chosen": -1008.0,
"logps/rejected": -1920.0,
"loss": 0.0145,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.3125,
"rewards/margins": 9.5,
"rewards/rejected": -16.75,
"step": 1320
},
{
"epoch": 2.759336099585062,
"grad_norm": 1.063282018361314,
"learning_rate": 9.743828665053466e-09,
"logits/chosen": -0.39453125,
"logits/rejected": -0.54296875,
"logps/chosen": -1088.0,
"logps/rejected": -1976.0,
"loss": 0.0086,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.40625,
"rewards/margins": 9.5,
"rewards/rejected": -16.875,
"step": 1330
},
{
"epoch": 2.780082987551867,
"grad_norm": 1.2970387106157468,
"learning_rate": 8.145062944090425e-09,
"logits/chosen": -0.6015625,
"logits/rejected": -0.6484375,
"logps/chosen": -952.0,
"logps/rejected": -1704.0,
"loss": 0.0222,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.28125,
"rewards/margins": 7.34375,
"rewards/rejected": -13.625,
"step": 1340
},
{
"epoch": 2.800829875518672,
"grad_norm": 2.5481269848678614,
"learning_rate": 6.687316669763937e-09,
"logits/chosen": -0.90234375,
"logits/rejected": -0.97265625,
"logps/chosen": -1080.0,
"logps/rejected": -1720.0,
"loss": 0.0177,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.96875,
"rewards/margins": 7.0625,
"rewards/rejected": -14.0,
"step": 1350
},
{
"epoch": 2.821576763485477,
"grad_norm": 2.514879468782849,
"learning_rate": 5.371439816754892e-09,
"logits/chosen": -0.61328125,
"logits/rejected": -0.71875,
"logps/chosen": -1048.0,
"logps/rejected": -1512.0,
"loss": 0.0179,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -7.0625,
"rewards/margins": 5.21875,
"rewards/rejected": -12.25,
"step": 1360
},
{
"epoch": 2.8423236514522823,
"grad_norm": 1.413587585861375,
"learning_rate": 4.198199639302152e-09,
"logits/chosen": -0.6171875,
"logits/rejected": -0.671875,
"logps/chosen": -1032.0,
"logps/rejected": -1888.0,
"loss": 0.0164,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.375,
"rewards/margins": 8.375,
"rewards/rejected": -15.75,
"step": 1370
},
{
"epoch": 2.863070539419087,
"grad_norm": 2.2560390766041793,
"learning_rate": 3.1682802238362506e-09,
"logits/chosen": -0.625,
"logits/rejected": -0.69921875,
"logps/chosen": -1104.0,
"logps/rejected": -1992.0,
"loss": 0.0077,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.75,
"rewards/margins": 9.0,
"rewards/rejected": -16.75,
"step": 1380
},
{
"epoch": 2.883817427385892,
"grad_norm": 2.6052476361718147,
"learning_rate": 2.2822820901060025e-09,
"logits/chosen": -0.76953125,
"logits/rejected": -0.69140625,
"logps/chosen": -1224.0,
"logps/rejected": -1768.0,
"loss": 0.0111,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.3125,
"rewards/margins": 7.46875,
"rewards/rejected": -14.8125,
"step": 1390
},
{
"epoch": 2.904564315352697,
"grad_norm": 0.8393040331914429,
"learning_rate": 1.5407218410307398e-09,
"logits/chosen": -0.765625,
"logits/rejected": -0.79296875,
"logps/chosen": -1136.0,
"logps/rejected": -1656.0,
"loss": 0.0104,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.375,
"rewards/margins": 4.84375,
"rewards/rejected": -13.1875,
"step": 1400
},
{
"epoch": 2.9253112033195023,
"grad_norm": 3.1380931604937596,
"learning_rate": 9.440318614823417e-10,
"logits/chosen": -0.62109375,
"logits/rejected": -0.62109375,
"logps/chosen": -972.0,
"logps/rejected": -1560.0,
"loss": 0.0148,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -6.96875,
"rewards/margins": 6.21875,
"rewards/rejected": -13.1875,
"step": 1410
},
{
"epoch": 2.9460580912863072,
"grad_norm": 1.4055560802364486,
"learning_rate": 4.925600661726537e-10,
"logits/chosen": -0.5859375,
"logits/rejected": -0.734375,
"logps/chosen": -1096.0,
"logps/rejected": -1984.0,
"loss": 0.013,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.21875,
"rewards/margins": 9.1875,
"rewards/rejected": -16.375,
"step": 1420
},
{
"epoch": 2.966804979253112,
"grad_norm": 1.2654931032152357,
"learning_rate": 1.8656969679323176e-10,
"logits/chosen": -0.78515625,
"logits/rejected": -0.84375,
"logps/chosen": -1208.0,
"logps/rejected": -1864.0,
"loss": 0.0111,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.53125,
"rewards/margins": 8.0,
"rewards/rejected": -15.5,
"step": 1430
},
{
"epoch": 2.987551867219917,
"grad_norm": 1.055552542271011,
"learning_rate": 2.6239168525898915e-11,
"logits/chosen": -0.73828125,
"logits/rejected": -0.59765625,
"logps/chosen": -1224.0,
"logps/rejected": -1792.0,
"loss": 0.0109,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.125,
"rewards/margins": 7.1875,
"rewards/rejected": -15.3125,
"step": 1440
},
{
"epoch": 3.0,
"step": 1446,
"total_flos": 0.0,
"train_loss": 0.21590606526138048,
"train_runtime": 112848.5516,
"train_samples_per_second": 0.819,
"train_steps_per_second": 0.013
}
],
"logging_steps": 10,
"max_steps": 1446,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}