{ "best_global_step": 850, "best_metric": 0.31901347637176514, "best_model_checkpoint": "/experiment_results/dpo/A-vibe_OPEN_SOURCE_checkpoint-1600_dpo_chosen_OUR_super_unsafe_from_PR_x15_NEW_CORRECT_04_10_25_v9/checkpoint-850", "epoch": 1.0, "eval_steps": 50, "global_step": 904, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011061946902654867, "grad_norm": 21.33904266357422, "learning_rate": 0.0, "logits/chosen": -1.55078125, "logits/rejected": -1.46875, "logps/chosen": -288.0, "logps/rejected": -235.5, "loss": 0.7017, "rewards/accuracies": 0.078125, "rewards/chosen": -0.007916450500488281, "rewards/margins": -0.0164794921875, "rewards/rejected": 0.00848388671875, "step": 1 }, { "epoch": 0.0022123893805309734, "grad_norm": 19.44487762451172, "learning_rate": 1.7857142857142856e-08, "logits/chosen": -1.5, "logits/rejected": -1.43359375, "logps/chosen": -259.0, "logps/rejected": -226.0, "loss": 0.6987, "rewards/accuracies": 0.171875, "rewards/chosen": -0.006072998046875, "rewards/margins": -0.007415771484375, "rewards/rejected": 0.0013580322265625, "step": 2 }, { "epoch": 0.00331858407079646, "grad_norm": 21.772796630859375, "learning_rate": 3.571428571428571e-08, "logits/chosen": -1.58984375, "logits/rejected": -1.54296875, "logps/chosen": -288.0, "logps/rejected": -286.0, "loss": 0.6943, "rewards/accuracies": 0.296875, "rewards/chosen": -0.0041046142578125, "rewards/margins": 0.002166748046875, "rewards/rejected": -0.0062713623046875, "step": 3 }, { "epoch": 0.004424778761061947, "grad_norm": 20.7520751953125, "learning_rate": 5.3571428571428564e-08, "logits/chosen": -1.65625, "logits/rejected": -1.6015625, "logps/chosen": -257.5, "logps/rejected": -243.0, "loss": 0.6858, "rewards/accuracies": 0.328125, "rewards/chosen": 0.0084075927734375, "rewards/margins": 0.0184326171875, "rewards/rejected": -0.009979248046875, "step": 4 }, { "epoch": 0.0055309734513274336, "grad_norm": 22.113121032714844, "learning_rate": 7.142857142857142e-08, "logits/chosen": -1.5234375, "logits/rejected": -1.53515625, "logps/chosen": -263.0, "logps/rejected": -262.5, "loss": 0.6965, "rewards/accuracies": 0.28125, "rewards/chosen": -0.008130073547363281, "rewards/margins": -0.003143310546875, "rewards/rejected": -0.0049991607666015625, "step": 5 }, { "epoch": 0.00663716814159292, "grad_norm": 22.67697525024414, "learning_rate": 8.928571428571429e-08, "logits/chosen": -1.4609375, "logits/rejected": -1.62109375, "logps/chosen": -252.5, "logps/rejected": -259.5, "loss": 0.6851, "rewards/accuracies": 0.3046875, "rewards/chosen": 0.00469970703125, "rewards/margins": 0.013885498046875, "rewards/rejected": -0.009189605712890625, "step": 6 }, { "epoch": 0.007743362831858407, "grad_norm": 23.316373825073242, "learning_rate": 1.0714285714285713e-07, "logits/chosen": -1.46484375, "logits/rejected": -1.3984375, "logps/chosen": -279.0, "logps/rejected": -271.0, "loss": 0.698, "rewards/accuracies": 0.3125, "rewards/chosen": 0.00156402587890625, "rewards/margins": -0.0079498291015625, "rewards/rejected": 0.00946044921875, "step": 7 }, { "epoch": 0.008849557522123894, "grad_norm": 24.865726470947266, "learning_rate": 1.25e-07, "logits/chosen": -1.43359375, "logits/rejected": -1.546875, "logps/chosen": -275.0, "logps/rejected": -292.0, "loss": 0.7039, "rewards/accuracies": 0.203125, "rewards/chosen": -0.006072998046875, "rewards/margins": -0.01904296875, "rewards/rejected": 0.01300048828125, "step": 8 }, { "epoch": 0.00995575221238938, "grad_norm": 20.924415588378906, "learning_rate": 1.4285714285714285e-07, "logits/chosen": -1.55859375, "logits/rejected": -1.51953125, "logps/chosen": -238.5, "logps/rejected": -238.5, "loss": 0.6892, "rewards/accuracies": 0.296875, "rewards/chosen": 0.0125274658203125, "rewards/margins": 0.0072021484375, "rewards/rejected": 0.0052642822265625, "step": 9 }, { "epoch": 0.011061946902654867, "grad_norm": 19.864246368408203, "learning_rate": 1.6071428571428573e-07, "logits/chosen": -1.56640625, "logits/rejected": -1.48046875, "logps/chosen": -249.0, "logps/rejected": -230.0, "loss": 0.6956, "rewards/accuracies": 0.3046875, "rewards/chosen": 0.0086822509765625, "rewards/margins": 0.00128173828125, "rewards/rejected": 0.0074615478515625, "step": 10 }, { "epoch": 0.012168141592920354, "grad_norm": 22.528316497802734, "learning_rate": 1.7857142857142858e-07, "logits/chosen": -1.59375, "logits/rejected": -1.5, "logps/chosen": -272.0, "logps/rejected": -290.0, "loss": 0.6936, "rewards/accuracies": 0.3359375, "rewards/chosen": -0.0045032501220703125, "rewards/margins": 0.0057544708251953125, "rewards/rejected": -0.01029062271118164, "step": 11 }, { "epoch": 0.01327433628318584, "grad_norm": 21.385112762451172, "learning_rate": 1.964285714285714e-07, "logits/chosen": -1.43359375, "logits/rejected": -1.38671875, "logps/chosen": -270.0, "logps/rejected": -281.0, "loss": 0.6895, "rewards/accuracies": 0.3359375, "rewards/chosen": 0.001373291015625, "rewards/margins": 0.0108795166015625, "rewards/rejected": -0.009471893310546875, "step": 12 }, { "epoch": 0.014380530973451327, "grad_norm": 21.703392028808594, "learning_rate": 2.1428571428571426e-07, "logits/chosen": -1.51953125, "logits/rejected": -1.35546875, "logps/chosen": -258.0, "logps/rejected": -263.0, "loss": 0.7104, "rewards/accuracies": 0.2421875, "rewards/chosen": -0.0135040283203125, "rewards/margins": -0.0324249267578125, "rewards/rejected": 0.01898193359375, "step": 13 }, { "epoch": 0.015486725663716814, "grad_norm": 19.697071075439453, "learning_rate": 2.3214285714285714e-07, "logits/chosen": -1.4140625, "logits/rejected": -1.56640625, "logps/chosen": -248.0, "logps/rejected": -233.5, "loss": 0.6953, "rewards/accuracies": 0.3125, "rewards/chosen": 0.005401611328125, "rewards/margins": -0.001678466796875, "rewards/rejected": 0.007049560546875, "step": 14 }, { "epoch": 0.016592920353982302, "grad_norm": 21.335206985473633, "learning_rate": 2.5e-07, "logits/chosen": -1.56640625, "logits/rejected": -1.51953125, "logps/chosen": -272.0, "logps/rejected": -270.0, "loss": 0.6838, "rewards/accuracies": 0.34375, "rewards/chosen": 0.0098724365234375, "rewards/margins": 0.0190277099609375, "rewards/rejected": -0.009204864501953125, "step": 15 }, { "epoch": 0.017699115044247787, "grad_norm": 21.42949867248535, "learning_rate": 2.6785714285714284e-07, "logits/chosen": -1.515625, "logits/rejected": -1.6328125, "logps/chosen": -248.5, "logps/rejected": -244.5, "loss": 0.6785, "rewards/accuracies": 0.34375, "rewards/chosen": 0.00927734375, "rewards/margins": 0.0289306640625, "rewards/rejected": -0.0196533203125, "step": 16 }, { "epoch": 0.018805309734513276, "grad_norm": 20.796878814697266, "learning_rate": 2.857142857142857e-07, "logits/chosen": -1.60546875, "logits/rejected": -1.625, "logps/chosen": -231.5, "logps/rejected": -231.5, "loss": 0.6899, "rewards/accuracies": 0.34375, "rewards/chosen": 0.00724029541015625, "rewards/margins": 0.011138916015625, "rewards/rejected": -0.00391387939453125, "step": 17 }, { "epoch": 0.01991150442477876, "grad_norm": 20.082786560058594, "learning_rate": 3.0357142857142855e-07, "logits/chosen": -1.46875, "logits/rejected": -1.40625, "logps/chosen": -251.0, "logps/rejected": -248.5, "loss": 0.688, "rewards/accuracies": 0.40625, "rewards/chosen": 0.016357421875, "rewards/margins": 0.0147705078125, "rewards/rejected": 0.0015716552734375, "step": 18 }, { "epoch": 0.02101769911504425, "grad_norm": 21.640682220458984, "learning_rate": 3.2142857142857145e-07, "logits/chosen": -1.59765625, "logits/rejected": -1.3515625, "logps/chosen": -264.0, "logps/rejected": -262.0, "loss": 0.6912, "rewards/accuracies": 0.3359375, "rewards/chosen": 0.01506805419921875, "rewards/margins": 0.005462646484375, "rewards/rejected": 0.00958251953125, "step": 19 }, { "epoch": 0.022123893805309734, "grad_norm": 22.128896713256836, "learning_rate": 3.392857142857143e-07, "logits/chosen": -1.57421875, "logits/rejected": -1.47265625, "logps/chosen": -267.5, "logps/rejected": -267.0, "loss": 0.6917, "rewards/accuracies": 0.3515625, "rewards/chosen": 0.01458740234375, "rewards/margins": 0.0075225830078125, "rewards/rejected": 0.007049560546875, "step": 20 }, { "epoch": 0.023230088495575223, "grad_norm": 20.139122009277344, "learning_rate": 3.5714285714285716e-07, "logits/chosen": -1.58203125, "logits/rejected": -1.46484375, "logps/chosen": -251.5, "logps/rejected": -251.0, "loss": 0.699, "rewards/accuracies": 0.3203125, "rewards/chosen": 0.003238677978515625, "rewards/margins": -0.0052642822265625, "rewards/rejected": 0.008502960205078125, "step": 21 }, { "epoch": 0.024336283185840708, "grad_norm": 20.964323043823242, "learning_rate": 3.75e-07, "logits/chosen": -1.55859375, "logits/rejected": -1.49609375, "logps/chosen": -236.0, "logps/rejected": -260.5, "loss": 0.6882, "rewards/accuracies": 0.3984375, "rewards/chosen": 0.02685546875, "rewards/margins": 0.013885498046875, "rewards/rejected": 0.01297760009765625, "step": 22 }, { "epoch": 0.025442477876106196, "grad_norm": 19.556018829345703, "learning_rate": 3.928571428571428e-07, "logits/chosen": -1.6015625, "logits/rejected": -1.4296875, "logps/chosen": -234.0, "logps/rejected": -210.0, "loss": 0.6941, "rewards/accuracies": 0.359375, "rewards/chosen": 0.01114654541015625, "rewards/margins": 0.00146484375, "rewards/rejected": 0.0096893310546875, "step": 23 }, { "epoch": 0.02654867256637168, "grad_norm": 195.61749267578125, "learning_rate": 4.1071428571428566e-07, "logits/chosen": -1.59375, "logits/rejected": -1.328125, "logps/chosen": -264.0, "logps/rejected": -329.5, "loss": 0.676, "rewards/accuracies": 0.4921875, "rewards/chosen": 0.0570068359375, "rewards/margins": -0.006103515625, "rewards/rejected": 0.0631866455078125, "step": 24 }, { "epoch": 0.02765486725663717, "grad_norm": 21.722719192504883, "learning_rate": 4.285714285714285e-07, "logits/chosen": -1.4375, "logits/rejected": -1.546875, "logps/chosen": -259.0, "logps/rejected": -269.0, "loss": 0.6887, "rewards/accuracies": 0.453125, "rewards/chosen": 0.0538330078125, "rewards/margins": 0.0157470703125, "rewards/rejected": 0.03802490234375, "step": 25 }, { "epoch": 0.028761061946902654, "grad_norm": 22.364490509033203, "learning_rate": 4.464285714285714e-07, "logits/chosen": -1.41015625, "logits/rejected": -1.36328125, "logps/chosen": -296.0, "logps/rejected": -305.0, "loss": 0.6882, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0601806640625, "rewards/margins": 0.01422119140625, "rewards/rejected": 0.0460205078125, "step": 26 }, { "epoch": 0.029867256637168143, "grad_norm": 20.38817024230957, "learning_rate": 4.6428571428571427e-07, "logits/chosen": -1.44140625, "logits/rejected": -1.390625, "logps/chosen": -280.0, "logps/rejected": -265.0, "loss": 0.6743, "rewards/accuracies": 0.4609375, "rewards/chosen": 0.0726318359375, "rewards/margins": 0.0447998046875, "rewards/rejected": 0.02783203125, "step": 27 }, { "epoch": 0.030973451327433628, "grad_norm": 21.340524673461914, "learning_rate": 4.821428571428571e-07, "logits/chosen": -1.4609375, "logits/rejected": -1.49609375, "logps/chosen": -263.0, "logps/rejected": -233.5, "loss": 0.6704, "rewards/accuracies": 0.4921875, "rewards/chosen": 0.0859375, "rewards/margins": 0.05419921875, "rewards/rejected": 0.03167724609375, "step": 28 }, { "epoch": 0.032079646017699116, "grad_norm": 22.794097900390625, "learning_rate": 5e-07, "logits/chosen": -1.47265625, "logits/rejected": -1.52734375, "logps/chosen": -251.5, "logps/rejected": -277.0, "loss": 0.6665, "rewards/accuracies": 0.5, "rewards/chosen": 0.100341796875, "rewards/margins": 0.0615234375, "rewards/rejected": 0.03863525390625, "step": 29 }, { "epoch": 0.033185840707964605, "grad_norm": 21.916282653808594, "learning_rate": 4.999983923145526e-07, "logits/chosen": -1.45703125, "logits/rejected": -1.44140625, "logps/chosen": -268.0, "logps/rejected": -271.0, "loss": 0.6672, "rewards/accuracies": 0.5078125, "rewards/chosen": 0.087158203125, "rewards/margins": 0.052978515625, "rewards/rejected": 0.03411865234375, "step": 30 }, { "epoch": 0.034292035398230086, "grad_norm": 20.50246810913086, "learning_rate": 4.999935692788877e-07, "logits/chosen": -1.44140625, "logits/rejected": -1.42578125, "logps/chosen": -263.0, "logps/rejected": -280.0, "loss": 0.6626, "rewards/accuracies": 0.4921875, "rewards/chosen": 0.1103515625, "rewards/margins": 0.071533203125, "rewards/rejected": 0.03887939453125, "step": 31 }, { "epoch": 0.035398230088495575, "grad_norm": 21.142545700073242, "learning_rate": 4.999855309550366e-07, "logits/chosen": -1.54296875, "logits/rejected": -1.5859375, "logps/chosen": -291.0, "logps/rejected": -268.0, "loss": 0.6704, "rewards/accuracies": 0.5390625, "rewards/chosen": 0.091552734375, "rewards/margins": 0.0552978515625, "rewards/rejected": 0.03607177734375, "step": 32 }, { "epoch": 0.03650442477876106, "grad_norm": 20.50800895690918, "learning_rate": 4.999742774463842e-07, "logits/chosen": -1.4375, "logits/rejected": -1.40234375, "logps/chosen": -256.5, "logps/rejected": -270.0, "loss": 0.6494, "rewards/accuracies": 0.6171875, "rewards/chosen": 0.1484375, "rewards/margins": 0.092041015625, "rewards/rejected": 0.0565185546875, "step": 33 }, { "epoch": 0.03761061946902655, "grad_norm": 19.532590866088867, "learning_rate": 4.999598088976672e-07, "logits/chosen": -1.49609375, "logits/rejected": -1.4765625, "logps/chosen": -250.0, "logps/rejected": -260.0, "loss": 0.6445, "rewards/accuracies": 0.59375, "rewards/chosen": 0.177734375, "rewards/margins": 0.105224609375, "rewards/rejected": 0.072509765625, "step": 34 }, { "epoch": 0.03871681415929203, "grad_norm": 20.883621215820312, "learning_rate": 4.999421254949727e-07, "logits/chosen": -1.5390625, "logits/rejected": -1.4140625, "logps/chosen": -271.0, "logps/rejected": -270.0, "loss": 0.6501, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.16943359375, "rewards/margins": 0.103271484375, "rewards/rejected": 0.0655517578125, "step": 35 }, { "epoch": 0.03982300884955752, "grad_norm": 20.3232479095459, "learning_rate": 4.999212274657353e-07, "logits/chosen": -1.51953125, "logits/rejected": -1.46484375, "logps/chosen": -257.0, "logps/rejected": -255.5, "loss": 0.6428, "rewards/accuracies": 0.6015625, "rewards/chosen": 0.2080078125, "rewards/margins": 0.114013671875, "rewards/rejected": 0.09375, "step": 36 }, { "epoch": 0.04092920353982301, "grad_norm": 21.007495880126953, "learning_rate": 4.99897115078735e-07, "logits/chosen": -1.4609375, "logits/rejected": -1.58984375, "logps/chosen": -259.5, "logps/rejected": -253.0, "loss": 0.636, "rewards/accuracies": 0.5859375, "rewards/chosen": 0.24609375, "rewards/margins": 0.13525390625, "rewards/rejected": 0.110595703125, "step": 37 }, { "epoch": 0.0420353982300885, "grad_norm": 18.463993072509766, "learning_rate": 4.998697886440926e-07, "logits/chosen": -1.5078125, "logits/rejected": -1.4375, "logps/chosen": -242.0, "logps/rejected": -246.0, "loss": 0.6384, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.23681640625, "rewards/margins": 0.130859375, "rewards/rejected": 0.106201171875, "step": 38 }, { "epoch": 0.04314159292035398, "grad_norm": 20.67741584777832, "learning_rate": 4.998392485132666e-07, "logits/chosen": -1.49609375, "logits/rejected": -1.375, "logps/chosen": -267.0, "logps/rejected": -275.0, "loss": 0.6331, "rewards/accuracies": 0.5546875, "rewards/chosen": 0.275390625, "rewards/margins": 0.14990234375, "rewards/rejected": 0.12548828125, "step": 39 }, { "epoch": 0.04424778761061947, "grad_norm": 20.392776489257812, "learning_rate": 4.998054950790485e-07, "logits/chosen": -1.359375, "logits/rejected": -1.4609375, "logps/chosen": -275.0, "logps/rejected": -285.0, "loss": 0.6218, "rewards/accuracies": 0.59375, "rewards/chosen": 0.2841796875, "rewards/margins": 0.1650390625, "rewards/rejected": 0.11865234375, "step": 40 }, { "epoch": 0.04535398230088496, "grad_norm": 19.42493438720703, "learning_rate": 4.997685287755575e-07, "logits/chosen": -1.515625, "logits/rejected": -1.3828125, "logps/chosen": -271.0, "logps/rejected": -262.5, "loss": 0.6274, "rewards/accuracies": 0.5703125, "rewards/chosen": 0.2744140625, "rewards/margins": 0.15234375, "rewards/rejected": 0.12158203125, "step": 41 }, { "epoch": 0.046460176991150445, "grad_norm": 19.07245635986328, "learning_rate": 4.99728350078235e-07, "logits/chosen": -1.53515625, "logits/rejected": -1.4453125, "logps/chosen": -274.0, "logps/rejected": -251.5, "loss": 0.6108, "rewards/accuracies": 0.5859375, "rewards/chosen": 0.3173828125, "rewards/margins": 0.18896484375, "rewards/rejected": 0.127685546875, "step": 42 }, { "epoch": 0.04756637168141593, "grad_norm": 19.7177677154541, "learning_rate": 4.996849595038388e-07, "logits/chosen": -1.515625, "logits/rejected": -1.49609375, "logps/chosen": -273.5, "logps/rejected": -281.0, "loss": 0.6208, "rewards/accuracies": 0.578125, "rewards/chosen": 0.3310546875, "rewards/margins": 0.17529296875, "rewards/rejected": 0.15625, "step": 43 }, { "epoch": 0.048672566371681415, "grad_norm": 19.820003509521484, "learning_rate": 4.996383576104361e-07, "logits/chosen": -1.5234375, "logits/rejected": -1.421875, "logps/chosen": -261.0, "logps/rejected": -263.5, "loss": 0.6196, "rewards/accuracies": 0.5859375, "rewards/chosen": 0.330078125, "rewards/margins": 0.1796875, "rewards/rejected": 0.15087890625, "step": 44 }, { "epoch": 0.049778761061946904, "grad_norm": 20.092729568481445, "learning_rate": 4.995885449973962e-07, "logits/chosen": -1.36328125, "logits/rejected": -1.39453125, "logps/chosen": -293.0, "logps/rejected": -295.0, "loss": 0.6111, "rewards/accuracies": 0.5859375, "rewards/chosen": 0.3408203125, "rewards/margins": 0.20068359375, "rewards/rejected": 0.140380859375, "step": 45 }, { "epoch": 0.05088495575221239, "grad_norm": 18.567899703979492, "learning_rate": 4.995355223053834e-07, "logits/chosen": -1.5, "logits/rejected": -1.44921875, "logps/chosen": -260.5, "logps/rejected": -255.5, "loss": 0.6146, "rewards/accuracies": 0.59375, "rewards/chosen": 0.3359375, "rewards/margins": 0.1982421875, "rewards/rejected": 0.13720703125, "step": 46 }, { "epoch": 0.051991150442477874, "grad_norm": 20.356060028076172, "learning_rate": 4.994792902163481e-07, "logits/chosen": -1.45703125, "logits/rejected": -1.29296875, "logps/chosen": -280.0, "logps/rejected": -260.0, "loss": 0.627, "rewards/accuracies": 0.5859375, "rewards/chosen": 0.357421875, "rewards/margins": 0.1787109375, "rewards/rejected": 0.1787109375, "step": 47 }, { "epoch": 0.05309734513274336, "grad_norm": 20.717748641967773, "learning_rate": 4.994198494535182e-07, "logits/chosen": -1.4765625, "logits/rejected": -1.41796875, "logps/chosen": -280.0, "logps/rejected": -281.0, "loss": 0.5881, "rewards/accuracies": 0.6171875, "rewards/chosen": 0.4091796875, "rewards/margins": 0.25390625, "rewards/rejected": 0.1552734375, "step": 48 }, { "epoch": 0.05420353982300885, "grad_norm": 19.05792236328125, "learning_rate": 4.993572007813904e-07, "logits/chosen": -1.390625, "logits/rejected": -1.35546875, "logps/chosen": -251.5, "logps/rejected": -277.0, "loss": 0.5889, "rewards/accuracies": 0.6015625, "rewards/chosen": 0.41015625, "rewards/margins": 0.263671875, "rewards/rejected": 0.14599609375, "step": 49 }, { "epoch": 0.05530973451327434, "grad_norm": 17.29762840270996, "learning_rate": 4.992913450057195e-07, "logits/chosen": -1.41796875, "logits/rejected": -1.35546875, "logps/chosen": -237.0, "logps/rejected": -224.5, "loss": 0.5867, "rewards/accuracies": 0.6015625, "rewards/chosen": 0.4501953125, "rewards/margins": 0.26953125, "rewards/rejected": 0.1796875, "step": 50 }, { "epoch": 0.05530973451327434, "eval_logits/chosen": -1.4197372198104858, "eval_logits/rejected": -1.4136348962783813, "eval_logps/chosen": -255.96517944335938, "eval_logps/rejected": -257.37811279296875, "eval_loss": 0.5778365731239319, "eval_rewards/accuracies": 0.6217424273490906, "eval_rewards/chosen": 0.4856770932674408, "eval_rewards/margins": 0.2985657751560211, "eval_rewards/rejected": 0.1872473508119583, "eval_runtime": 210.1095, "eval_samples_per_second": 61.173, "eval_steps_per_second": 0.957, "step": 50 }, { "epoch": 0.05641592920353982, "grad_norm": 18.575069427490234, "learning_rate": 4.992222829735082e-07, "logits/chosen": -1.5078125, "logits/rejected": -1.421875, "logps/chosen": -260.0, "logps/rejected": -259.0, "loss": 0.5874, "rewards/accuracies": 0.609375, "rewards/chosen": 0.48046875, "rewards/margins": 0.27734375, "rewards/rejected": 0.203125, "step": 51 }, { "epoch": 0.05752212389380531, "grad_norm": 17.976177215576172, "learning_rate": 4.991500155729971e-07, "logits/chosen": -1.42578125, "logits/rejected": -1.43359375, "logps/chosen": -252.5, "logps/rejected": -256.0, "loss": 0.575, "rewards/accuracies": 0.6640625, "rewards/chosen": 0.50390625, "rewards/margins": 0.30078125, "rewards/rejected": 0.20458984375, "step": 52 }, { "epoch": 0.0586283185840708, "grad_norm": 17.979496002197266, "learning_rate": 4.99074543733652e-07, "logits/chosen": -1.4453125, "logits/rejected": -1.46875, "logps/chosen": -265.0, "logps/rejected": -267.0, "loss": 0.5444, "rewards/accuracies": 0.7109375, "rewards/chosen": 0.55859375, "rewards/margins": 0.3984375, "rewards/rejected": 0.1611328125, "step": 53 }, { "epoch": 0.059734513274336286, "grad_norm": 17.736249923706055, "learning_rate": 4.989958684261526e-07, "logits/chosen": -1.31640625, "logits/rejected": -1.375, "logps/chosen": -255.5, "logps/rejected": -292.0, "loss": 0.5529, "rewards/accuracies": 0.59375, "rewards/chosen": 0.568359375, "rewards/margins": 0.39453125, "rewards/rejected": 0.17333984375, "step": 54 }, { "epoch": 0.06084070796460177, "grad_norm": 17.284420013427734, "learning_rate": 4.989139906623802e-07, "logits/chosen": -1.44140625, "logits/rejected": -1.42578125, "logps/chosen": -253.0, "logps/rejected": -256.5, "loss": 0.5522, "rewards/accuracies": 0.6953125, "rewards/chosen": 0.599609375, "rewards/margins": 0.373046875, "rewards/rejected": 0.2255859375, "step": 55 }, { "epoch": 0.061946902654867256, "grad_norm": 17.704713821411133, "learning_rate": 4.988289114954044e-07, "logits/chosen": -1.3671875, "logits/rejected": -1.35546875, "logps/chosen": -237.5, "logps/rejected": -260.0, "loss": 0.5504, "rewards/accuracies": 0.609375, "rewards/chosen": 0.654296875, "rewards/margins": 0.3896484375, "rewards/rejected": 0.263671875, "step": 56 }, { "epoch": 0.06305309734513274, "grad_norm": 17.688888549804688, "learning_rate": 4.987406320194694e-07, "logits/chosen": -1.453125, "logits/rejected": -1.3359375, "logps/chosen": -242.5, "logps/rejected": -247.5, "loss": 0.5537, "rewards/accuracies": 0.65625, "rewards/chosen": 0.69140625, "rewards/margins": 0.3857421875, "rewards/rejected": 0.3046875, "step": 57 }, { "epoch": 0.06415929203539823, "grad_norm": 17.646419525146484, "learning_rate": 4.986491533699802e-07, "logits/chosen": -1.41015625, "logits/rejected": -1.37109375, "logps/chosen": -256.5, "logps/rejected": -281.0, "loss": 0.5431, "rewards/accuracies": 0.59375, "rewards/chosen": 0.71484375, "rewards/margins": 0.439453125, "rewards/rejected": 0.27490234375, "step": 58 }, { "epoch": 0.06526548672566372, "grad_norm": 17.19894027709961, "learning_rate": 4.985544767234879e-07, "logits/chosen": -1.41796875, "logits/rejected": -1.4453125, "logps/chosen": -245.5, "logps/rejected": -250.5, "loss": 0.5403, "rewards/accuracies": 0.609375, "rewards/chosen": 0.7421875, "rewards/margins": 0.4541015625, "rewards/rejected": 0.287109375, "step": 59 }, { "epoch": 0.06637168141592921, "grad_norm": 16.630441665649414, "learning_rate": 4.984566032976749e-07, "logits/chosen": -1.390625, "logits/rejected": -1.33984375, "logps/chosen": -248.0, "logps/rejected": -254.5, "loss": 0.5386, "rewards/accuracies": 0.6640625, "rewards/chosen": 0.828125, "rewards/margins": 0.466796875, "rewards/rejected": 0.361328125, "step": 60 }, { "epoch": 0.06747787610619468, "grad_norm": 17.252979278564453, "learning_rate": 4.983555343513384e-07, "logits/chosen": -1.41796875, "logits/rejected": -1.4375, "logps/chosen": -250.5, "logps/rejected": -275.5, "loss": 0.5028, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8515625, "rewards/margins": 0.578125, "rewards/rejected": 0.271484375, "step": 61 }, { "epoch": 0.06858407079646017, "grad_norm": 17.014602661132812, "learning_rate": 4.982512711843752e-07, "logits/chosen": -1.32421875, "logits/rejected": -1.30078125, "logps/chosen": -242.0, "logps/rejected": -243.5, "loss": 0.5159, "rewards/accuracies": 0.671875, "rewards/chosen": 0.94140625, "rewards/margins": 0.5458984375, "rewards/rejected": 0.39453125, "step": 62 }, { "epoch": 0.06969026548672566, "grad_norm": 16.877351760864258, "learning_rate": 4.98143815137764e-07, "logits/chosen": -1.40234375, "logits/rejected": -1.36328125, "logps/chosen": -264.0, "logps/rejected": -285.0, "loss": 0.5343, "rewards/accuracies": 0.6328125, "rewards/chosen": 0.857421875, "rewards/margins": 0.4853515625, "rewards/rejected": 0.373046875, "step": 63 }, { "epoch": 0.07079646017699115, "grad_norm": 17.047161102294922, "learning_rate": 4.980331675935493e-07, "logits/chosen": -1.33203125, "logits/rejected": -1.34375, "logps/chosen": -254.5, "logps/rejected": -291.0, "loss": 0.5211, "rewards/accuracies": 0.6484375, "rewards/chosen": 0.962890625, "rewards/margins": 0.533203125, "rewards/rejected": 0.4296875, "step": 64 }, { "epoch": 0.07190265486725664, "grad_norm": 17.61670684814453, "learning_rate": 4.979193299748224e-07, "logits/chosen": -1.2890625, "logits/rejected": -1.3828125, "logps/chosen": -265.0, "logps/rejected": -278.0, "loss": 0.4878, "rewards/accuracies": 0.7109375, "rewards/chosen": 0.96484375, "rewards/margins": 0.6640625, "rewards/rejected": 0.2998046875, "step": 65 }, { "epoch": 0.07300884955752213, "grad_norm": 17.460668563842773, "learning_rate": 4.978023037457043e-07, "logits/chosen": -1.44921875, "logits/rejected": -1.3515625, "logps/chosen": -267.0, "logps/rejected": -279.0, "loss": 0.536, "rewards/accuracies": 0.5703125, "rewards/chosen": 0.95703125, "rewards/margins": 0.5390625, "rewards/rejected": 0.416015625, "step": 66 }, { "epoch": 0.07411504424778761, "grad_norm": 181.168212890625, "learning_rate": 4.976820904113256e-07, "logits/chosen": -1.36328125, "logits/rejected": -1.30859375, "logps/chosen": -233.5, "logps/rejected": -340.0, "loss": 0.489, "rewards/accuracies": 0.734375, "rewards/chosen": 1.05859375, "rewards/margins": 0.646484375, "rewards/rejected": 0.41015625, "step": 67 }, { "epoch": 0.0752212389380531, "grad_norm": 15.64547061920166, "learning_rate": 4.975586915178084e-07, "logits/chosen": -1.3671875, "logits/rejected": -1.40234375, "logps/chosen": -241.5, "logps/rejected": -256.0, "loss": 0.4702, "rewards/accuracies": 0.6953125, "rewards/chosen": 1.041015625, "rewards/margins": 0.716796875, "rewards/rejected": 0.32421875, "step": 68 }, { "epoch": 0.07632743362831858, "grad_norm": 16.069597244262695, "learning_rate": 4.974321086522452e-07, "logits/chosen": -1.3046875, "logits/rejected": -1.234375, "logps/chosen": -256.5, "logps/rejected": -251.5, "loss": 0.5188, "rewards/accuracies": 0.6328125, "rewards/chosen": 0.986328125, "rewards/margins": 0.568359375, "rewards/rejected": 0.4208984375, "step": 69 }, { "epoch": 0.07743362831858407, "grad_norm": 149.1916046142578, "learning_rate": 4.973023434426798e-07, "logits/chosen": -1.4140625, "logits/rejected": -1.421875, "logps/chosen": -248.5, "logps/rejected": -247.0, "loss": 0.5642, "rewards/accuracies": 0.6875, "rewards/chosen": 0.96875, "rewards/margins": 0.5107421875, "rewards/rejected": 0.458984375, "step": 70 }, { "epoch": 0.07853982300884955, "grad_norm": 16.17060089111328, "learning_rate": 4.971693975580851e-07, "logits/chosen": -1.390625, "logits/rejected": -1.28515625, "logps/chosen": -232.0, "logps/rejected": -241.0, "loss": 0.5079, "rewards/accuracies": 0.6328125, "rewards/chosen": 1.025390625, "rewards/margins": 0.61328125, "rewards/rejected": 0.4140625, "step": 71 }, { "epoch": 0.07964601769911504, "grad_norm": 17.631471633911133, "learning_rate": 4.970332727083425e-07, "logits/chosen": -1.36328125, "logits/rejected": -1.35546875, "logps/chosen": -271.5, "logps/rejected": -283.0, "loss": 0.5212, "rewards/accuracies": 0.625, "rewards/chosen": 1.1328125, "rewards/margins": 0.619140625, "rewards/rejected": 0.5166015625, "step": 72 }, { "epoch": 0.08075221238938053, "grad_norm": 17.61063003540039, "learning_rate": 4.968939706442195e-07, "logits/chosen": -1.39453125, "logits/rejected": -1.203125, "logps/chosen": -275.0, "logps/rejected": -255.5, "loss": 0.5211, "rewards/accuracies": 0.640625, "rewards/chosen": 0.99609375, "rewards/margins": 0.583984375, "rewards/rejected": 0.4130859375, "step": 73 }, { "epoch": 0.08185840707964602, "grad_norm": 16.28321075439453, "learning_rate": 4.967514931573472e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.41015625, "logps/chosen": -243.0, "logps/rejected": -258.0, "loss": 0.4977, "rewards/accuracies": 0.6640625, "rewards/chosen": 1.20703125, "rewards/margins": 0.689453125, "rewards/rejected": 0.5146484375, "step": 74 }, { "epoch": 0.08296460176991151, "grad_norm": 16.079599380493164, "learning_rate": 4.966058420801977e-07, "logits/chosen": -1.34765625, "logits/rejected": -1.35546875, "logps/chosen": -259.5, "logps/rejected": -244.0, "loss": 0.4648, "rewards/accuracies": 0.6796875, "rewards/chosen": 1.13671875, "rewards/margins": 0.779296875, "rewards/rejected": 0.3583984375, "step": 75 }, { "epoch": 0.084070796460177, "grad_norm": 18.725271224975586, "learning_rate": 4.964570192860596e-07, "logits/chosen": -1.359375, "logits/rejected": -1.36328125, "logps/chosen": -287.0, "logps/rejected": -250.5, "loss": 0.5436, "rewards/accuracies": 0.5859375, "rewards/chosen": 1.109375, "rewards/margins": 0.5234375, "rewards/rejected": 0.583984375, "step": 76 }, { "epoch": 0.08517699115044247, "grad_norm": 15.678024291992188, "learning_rate": 4.963050266890152e-07, "logits/chosen": -1.328125, "logits/rejected": -1.4609375, "logps/chosen": -253.0, "logps/rejected": -246.0, "loss": 0.4833, "rewards/accuracies": 0.703125, "rewards/chosen": 1.15625, "rewards/margins": 0.767578125, "rewards/rejected": 0.390625, "step": 77 }, { "epoch": 0.08628318584070796, "grad_norm": 15.54704761505127, "learning_rate": 4.961498662439145e-07, "logits/chosen": -1.32421875, "logits/rejected": -1.359375, "logps/chosen": -230.5, "logps/rejected": -249.0, "loss": 0.4718, "rewards/accuracies": 0.7109375, "rewards/chosen": 1.3046875, "rewards/margins": 0.82421875, "rewards/rejected": 0.48046875, "step": 78 }, { "epoch": 0.08738938053097345, "grad_norm": 16.073156356811523, "learning_rate": 4.959915399463512e-07, "logits/chosen": -1.2890625, "logits/rejected": -1.27734375, "logps/chosen": -246.0, "logps/rejected": -259.5, "loss": 0.4602, "rewards/accuracies": 0.671875, "rewards/chosen": 1.27734375, "rewards/margins": 0.85546875, "rewards/rejected": 0.423828125, "step": 79 }, { "epoch": 0.08849557522123894, "grad_norm": 14.94100284576416, "learning_rate": 4.958300498326362e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.40234375, "logps/chosen": -231.0, "logps/rejected": -264.5, "loss": 0.4397, "rewards/accuracies": 0.7421875, "rewards/chosen": 1.30078125, "rewards/margins": 0.953125, "rewards/rejected": 0.345703125, "step": 80 }, { "epoch": 0.08960176991150443, "grad_norm": 17.28353500366211, "learning_rate": 4.956653979797721e-07, "logits/chosen": -1.3671875, "logits/rejected": -1.3828125, "logps/chosen": -280.0, "logps/rejected": -259.5, "loss": 0.5153, "rewards/accuracies": 0.609375, "rewards/chosen": 1.18359375, "rewards/margins": 0.71875, "rewards/rejected": 0.4658203125, "step": 81 }, { "epoch": 0.09070796460176991, "grad_norm": 15.823220252990723, "learning_rate": 4.954975865054259e-07, "logits/chosen": -1.3671875, "logits/rejected": -1.32421875, "logps/chosen": -255.5, "logps/rejected": -256.5, "loss": 0.4614, "rewards/accuracies": 0.6875, "rewards/chosen": 1.34375, "rewards/margins": 0.87109375, "rewards/rejected": 0.474609375, "step": 82 }, { "epoch": 0.0918141592920354, "grad_norm": 14.873113632202148, "learning_rate": 4.953266175679023e-07, "logits/chosen": -1.2890625, "logits/rejected": -1.3125, "logps/chosen": -236.5, "logps/rejected": -241.5, "loss": 0.4586, "rewards/accuracies": 0.703125, "rewards/chosen": 1.33984375, "rewards/margins": 0.869140625, "rewards/rejected": 0.47265625, "step": 83 }, { "epoch": 0.09292035398230089, "grad_norm": 16.7225341796875, "learning_rate": 4.951524933661154e-07, "logits/chosen": -1.328125, "logits/rejected": -1.30078125, "logps/chosen": -256.0, "logps/rejected": -237.0, "loss": 0.5129, "rewards/accuracies": 0.640625, "rewards/chosen": 1.1953125, "rewards/margins": 0.6953125, "rewards/rejected": 0.5009765625, "step": 84 }, { "epoch": 0.09402654867256637, "grad_norm": 15.362020492553711, "learning_rate": 4.949752161395605e-07, "logits/chosen": -1.3046875, "logits/rejected": -1.24609375, "logps/chosen": -257.0, "logps/rejected": -252.0, "loss": 0.4339, "rewards/accuracies": 0.703125, "rewards/chosen": 1.2890625, "rewards/margins": 0.990234375, "rewards/rejected": 0.30078125, "step": 85 }, { "epoch": 0.09513274336283185, "grad_norm": 15.91197681427002, "learning_rate": 4.94794788168286e-07, "logits/chosen": -1.32421875, "logits/rejected": -1.24609375, "logps/chosen": -229.5, "logps/rejected": -255.5, "loss": 0.4675, "rewards/accuracies": 0.6875, "rewards/chosen": 1.3203125, "rewards/margins": 0.85546875, "rewards/rejected": 0.4658203125, "step": 86 }, { "epoch": 0.09623893805309734, "grad_norm": 15.647570610046387, "learning_rate": 4.946112117728634e-07, "logits/chosen": -1.3671875, "logits/rejected": -1.3359375, "logps/chosen": -243.0, "logps/rejected": -235.5, "loss": 0.4574, "rewards/accuracies": 0.671875, "rewards/chosen": 1.28515625, "rewards/margins": 0.865234375, "rewards/rejected": 0.41796875, "step": 87 }, { "epoch": 0.09734513274336283, "grad_norm": 17.21525001525879, "learning_rate": 4.944244893143572e-07, "logits/chosen": -1.3515625, "logits/rejected": -1.26953125, "logps/chosen": -268.0, "logps/rejected": -264.0, "loss": 0.4832, "rewards/accuracies": 0.671875, "rewards/chosen": 1.28515625, "rewards/margins": 0.826171875, "rewards/rejected": 0.4580078125, "step": 88 }, { "epoch": 0.09845132743362832, "grad_norm": 16.55433464050293, "learning_rate": 4.942346231942955e-07, "logits/chosen": -1.3671875, "logits/rejected": -1.3828125, "logps/chosen": -255.0, "logps/rejected": -258.5, "loss": 0.4967, "rewards/accuracies": 0.609375, "rewards/chosen": 1.3359375, "rewards/margins": 0.80078125, "rewards/rejected": 0.53515625, "step": 89 }, { "epoch": 0.09955752212389381, "grad_norm": 15.434545516967773, "learning_rate": 4.94041615854638e-07, "logits/chosen": -1.3671875, "logits/rejected": -1.28515625, "logps/chosen": -265.0, "logps/rejected": -262.5, "loss": 0.4258, "rewards/accuracies": 0.6640625, "rewards/chosen": 1.42578125, "rewards/margins": 1.0390625, "rewards/rejected": 0.3876953125, "step": 90 }, { "epoch": 0.1006637168141593, "grad_norm": 16.600032806396484, "learning_rate": 4.938454697777457e-07, "logits/chosen": -1.234375, "logits/rejected": -1.1953125, "logps/chosen": -279.0, "logps/rejected": -278.0, "loss": 0.473, "rewards/accuracies": 0.671875, "rewards/chosen": 1.265625, "rewards/margins": 0.8828125, "rewards/rejected": 0.3818359375, "step": 91 }, { "epoch": 0.10176991150442478, "grad_norm": 17.01357078552246, "learning_rate": 4.936461874863479e-07, "logits/chosen": -1.3828125, "logits/rejected": -1.3828125, "logps/chosen": -255.5, "logps/rejected": -286.0, "loss": 0.495, "rewards/accuracies": 0.640625, "rewards/chosen": 1.3125, "rewards/margins": 0.833984375, "rewards/rejected": 0.4765625, "step": 92 }, { "epoch": 0.10287610619469026, "grad_norm": 15.263469696044922, "learning_rate": 4.934437715435107e-07, "logits/chosen": -1.27734375, "logits/rejected": -1.33203125, "logps/chosen": -244.5, "logps/rejected": -245.0, "loss": 0.4545, "rewards/accuracies": 0.6875, "rewards/chosen": 1.328125, "rewards/margins": 0.955078125, "rewards/rejected": 0.3759765625, "step": 93 }, { "epoch": 0.10398230088495575, "grad_norm": 15.016999244689941, "learning_rate": 4.932382245526034e-07, "logits/chosen": -1.23828125, "logits/rejected": -1.21875, "logps/chosen": -245.0, "logps/rejected": -256.5, "loss": 0.4381, "rewards/accuracies": 0.7265625, "rewards/chosen": 1.34375, "rewards/margins": 0.970703125, "rewards/rejected": 0.3740234375, "step": 94 }, { "epoch": 0.10508849557522124, "grad_norm": 16.34784507751465, "learning_rate": 4.930295491572653e-07, "logits/chosen": -1.37109375, "logits/rejected": -1.3203125, "logps/chosen": -247.0, "logps/rejected": -260.0, "loss": 0.4766, "rewards/accuracies": 0.640625, "rewards/chosen": 1.34765625, "rewards/margins": 0.87890625, "rewards/rejected": 0.46875, "step": 95 }, { "epoch": 0.10619469026548672, "grad_norm": 15.943212509155273, "learning_rate": 4.928177480413714e-07, "logits/chosen": -1.3046875, "logits/rejected": -1.28125, "logps/chosen": -259.0, "logps/rejected": -270.5, "loss": 0.4727, "rewards/accuracies": 0.6484375, "rewards/chosen": 1.3515625, "rewards/margins": 0.955078125, "rewards/rejected": 0.3955078125, "step": 96 }, { "epoch": 0.10730088495575221, "grad_norm": 16.164276123046875, "learning_rate": 4.926028239289984e-07, "logits/chosen": -1.2421875, "logits/rejected": -1.38671875, "logps/chosen": -273.5, "logps/rejected": -268.5, "loss": 0.4556, "rewards/accuracies": 0.6640625, "rewards/chosen": 1.37890625, "rewards/margins": 0.97265625, "rewards/rejected": 0.4072265625, "step": 97 }, { "epoch": 0.1084070796460177, "grad_norm": 16.16509437561035, "learning_rate": 4.923847795843893e-07, "logits/chosen": -1.359375, "logits/rejected": -1.2265625, "logps/chosen": -270.0, "logps/rejected": -277.0, "loss": 0.4657, "rewards/accuracies": 0.6875, "rewards/chosen": 1.265625, "rewards/margins": 0.947265625, "rewards/rejected": 0.3173828125, "step": 98 }, { "epoch": 0.10951327433628319, "grad_norm": 16.42505645751953, "learning_rate": 4.921636178119177e-07, "logits/chosen": -1.47265625, "logits/rejected": -1.16796875, "logps/chosen": -251.0, "logps/rejected": -232.5, "loss": 0.4747, "rewards/accuracies": 0.6328125, "rewards/chosen": 1.296875, "rewards/margins": 0.8671875, "rewards/rejected": 0.4287109375, "step": 99 }, { "epoch": 0.11061946902654868, "grad_norm": 23.872831344604492, "learning_rate": 4.919393414560522e-07, "logits/chosen": -1.39453125, "logits/rejected": -1.3125, "logps/chosen": -246.0, "logps/rejected": -249.5, "loss": 0.4521, "rewards/accuracies": 0.6875, "rewards/chosen": 1.26171875, "rewards/margins": 0.93359375, "rewards/rejected": 0.328125, "step": 100 }, { "epoch": 0.11061946902654868, "eval_logits/chosen": -1.329796314239502, "eval_logits/rejected": -1.3044931888580322, "eval_logps/chosen": -247.93531799316406, "eval_logps/rejected": -256.3333435058594, "eval_loss": 0.4438014328479767, "eval_rewards/accuracies": 0.7061508893966675, "eval_rewards/chosen": 1.2971081733703613, "eval_rewards/margins": 1.0102806091308594, "eval_rewards/rejected": 0.2868901491165161, "eval_runtime": 193.1281, "eval_samples_per_second": 66.552, "eval_steps_per_second": 1.041, "step": 100 }, { "epoch": 0.11172566371681415, "grad_norm": 13.894512176513672, "learning_rate": 4.917119534013193e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.234375, "logps/chosen": -233.0, "logps/rejected": -233.0, "loss": 0.418, "rewards/accuracies": 0.71875, "rewards/chosen": 1.25390625, "rewards/margins": 1.0546875, "rewards/rejected": 0.1982421875, "step": 101 }, { "epoch": 0.11283185840707964, "grad_norm": 15.946537017822266, "learning_rate": 4.91481456572267e-07, "logits/chosen": -1.359375, "logits/rejected": -1.23828125, "logps/chosen": -251.5, "logps/rejected": -249.5, "loss": 0.4642, "rewards/accuracies": 0.6953125, "rewards/chosen": 1.22265625, "rewards/margins": 0.91796875, "rewards/rejected": 0.30517578125, "step": 102 }, { "epoch": 0.11393805309734513, "grad_norm": 13.790292739868164, "learning_rate": 4.912478539334264e-07, "logits/chosen": -1.27734375, "logits/rejected": -1.35546875, "logps/chosen": -223.5, "logps/rejected": -241.0, "loss": 0.3972, "rewards/accuracies": 0.7421875, "rewards/chosen": 1.40625, "rewards/margins": 1.14453125, "rewards/rejected": 0.25634765625, "step": 103 }, { "epoch": 0.11504424778761062, "grad_norm": 14.399200439453125, "learning_rate": 4.910111484892739e-07, "logits/chosen": -1.296875, "logits/rejected": -1.26171875, "logps/chosen": -240.5, "logps/rejected": -260.5, "loss": 0.3929, "rewards/accuracies": 0.7734375, "rewards/chosen": 1.36328125, "rewards/margins": 1.23046875, "rewards/rejected": 0.1336669921875, "step": 104 }, { "epoch": 0.1161504424778761, "grad_norm": 16.46123504638672, "learning_rate": 4.907713432841928e-07, "logits/chosen": -1.24609375, "logits/rejected": -1.14453125, "logps/chosen": -255.0, "logps/rejected": -229.0, "loss": 0.5001, "rewards/accuracies": 0.6171875, "rewards/chosen": 1.0625, "rewards/margins": 0.828125, "rewards/rejected": 0.23486328125, "step": 105 }, { "epoch": 0.1172566371681416, "grad_norm": 16.047285079956055, "learning_rate": 4.905284414024337e-07, "logits/chosen": -1.23828125, "logits/rejected": -1.40234375, "logps/chosen": -242.5, "logps/rejected": -284.0, "loss": 0.4525, "rewards/accuracies": 0.7109375, "rewards/chosen": 1.22265625, "rewards/margins": 0.986328125, "rewards/rejected": 0.23388671875, "step": 106 }, { "epoch": 0.11836283185840708, "grad_norm": 16.925933837890625, "learning_rate": 4.902824459680752e-07, "logits/chosen": -1.2578125, "logits/rejected": -1.3125, "logps/chosen": -265.0, "logps/rejected": -266.0, "loss": 0.46, "rewards/accuracies": 0.7421875, "rewards/chosen": 1.19140625, "rewards/margins": 0.978515625, "rewards/rejected": 0.209228515625, "step": 107 }, { "epoch": 0.11946902654867257, "grad_norm": 15.083377838134766, "learning_rate": 4.900333601449835e-07, "logits/chosen": -1.265625, "logits/rejected": -1.33203125, "logps/chosen": -266.0, "logps/rejected": -265.0, "loss": 0.4376, "rewards/accuracies": 0.6875, "rewards/chosen": 1.21484375, "rewards/margins": 1.05078125, "rewards/rejected": 0.162109375, "step": 108 }, { "epoch": 0.12057522123893805, "grad_norm": 17.161651611328125, "learning_rate": 4.89781187136772e-07, "logits/chosen": -1.25, "logits/rejected": -1.27734375, "logps/chosen": -254.0, "logps/rejected": -276.0, "loss": 0.4388, "rewards/accuracies": 0.71875, "rewards/chosen": 1.26171875, "rewards/margins": 1.09375, "rewards/rejected": 0.16796875, "step": 109 }, { "epoch": 0.12168141592920353, "grad_norm": 15.041230201721191, "learning_rate": 4.895259301867595e-07, "logits/chosen": -1.23828125, "logits/rejected": -1.328125, "logps/chosen": -246.0, "logps/rejected": -280.0, "loss": 0.4269, "rewards/accuracies": 0.7109375, "rewards/chosen": 1.3046875, "rewards/margins": 1.1171875, "rewards/rejected": 0.18603515625, "step": 110 }, { "epoch": 0.12278761061946902, "grad_norm": 13.183340072631836, "learning_rate": 4.892675925779292e-07, "logits/chosen": -1.3359375, "logits/rejected": -1.36328125, "logps/chosen": -207.5, "logps/rejected": -250.0, "loss": 0.4122, "rewards/accuracies": 0.734375, "rewards/chosen": 1.328125, "rewards/margins": 1.26171875, "rewards/rejected": 0.0699310302734375, "step": 111 }, { "epoch": 0.12389380530973451, "grad_norm": 15.673736572265625, "learning_rate": 4.89006177632886e-07, "logits/chosen": -1.375, "logits/rejected": -1.37109375, "logps/chosen": -263.0, "logps/rejected": -272.0, "loss": 0.4412, "rewards/accuracies": 0.765625, "rewards/chosen": 1.24609375, "rewards/margins": 1.08203125, "rewards/rejected": 0.1640625, "step": 112 }, { "epoch": 0.125, "grad_norm": 14.536067008972168, "learning_rate": 4.887416887138138e-07, "logits/chosen": -1.1875, "logits/rejected": -1.15625, "logps/chosen": -257.5, "logps/rejected": -270.0, "loss": 0.4604, "rewards/accuracies": 0.671875, "rewards/chosen": 1.1953125, "rewards/margins": 1.09375, "rewards/rejected": 0.10595703125, "step": 113 }, { "epoch": 0.1261061946902655, "grad_norm": 14.573882102966309, "learning_rate": 4.884741292224326e-07, "logits/chosen": -1.296875, "logits/rejected": -1.34765625, "logps/chosen": -240.0, "logps/rejected": -268.5, "loss": 0.4091, "rewards/accuracies": 0.734375, "rewards/chosen": 1.265625, "rewards/margins": 1.16796875, "rewards/rejected": 0.09814453125, "step": 114 }, { "epoch": 0.12721238938053098, "grad_norm": 15.746649742126465, "learning_rate": 4.882035025999544e-07, "logits/chosen": -1.22265625, "logits/rejected": -1.1875, "logps/chosen": -273.5, "logps/rejected": -270.5, "loss": 0.4313, "rewards/accuracies": 0.71875, "rewards/chosen": 1.15234375, "rewards/margins": 1.1640625, "rewards/rejected": -0.0108642578125, "step": 115 }, { "epoch": 0.12831858407079647, "grad_norm": 15.159698486328125, "learning_rate": 4.879298123270391e-07, "logits/chosen": -1.32421875, "logits/rejected": -1.36328125, "logps/chosen": -244.5, "logps/rejected": -256.0, "loss": 0.4331, "rewards/accuracies": 0.6796875, "rewards/chosen": 1.23046875, "rewards/margins": 1.171875, "rewards/rejected": 0.0592041015625, "step": 116 }, { "epoch": 0.12942477876106195, "grad_norm": 14.491214752197266, "learning_rate": 4.876530619237495e-07, "logits/chosen": -1.20703125, "logits/rejected": -1.2578125, "logps/chosen": -235.5, "logps/rejected": -233.5, "loss": 0.4267, "rewards/accuracies": 0.6953125, "rewards/chosen": 1.19140625, "rewards/margins": 1.20703125, "rewards/rejected": -0.018310546875, "step": 117 }, { "epoch": 0.13053097345132744, "grad_norm": 15.388319969177246, "learning_rate": 4.873732549495065e-07, "logits/chosen": -1.3203125, "logits/rejected": -1.25, "logps/chosen": -263.0, "logps/rejected": -254.5, "loss": 0.4351, "rewards/accuracies": 0.7109375, "rewards/chosen": 1.16796875, "rewards/margins": 1.1171875, "rewards/rejected": 0.05218505859375, "step": 118 }, { "epoch": 0.13163716814159293, "grad_norm": 14.733474731445312, "learning_rate": 4.870903950030428e-07, "logits/chosen": -1.35546875, "logits/rejected": -1.31640625, "logps/chosen": -241.5, "logps/rejected": -274.0, "loss": 0.376, "rewards/accuracies": 0.765625, "rewards/chosen": 1.390625, "rewards/margins": 1.30859375, "rewards/rejected": 0.086669921875, "step": 119 }, { "epoch": 0.13274336283185842, "grad_norm": 14.26261043548584, "learning_rate": 4.868044857223571e-07, "logits/chosen": -1.2890625, "logits/rejected": -1.29296875, "logps/chosen": -248.5, "logps/rejected": -281.0, "loss": 0.3815, "rewards/accuracies": 0.7734375, "rewards/chosen": 1.296875, "rewards/margins": 1.3046875, "rewards/rejected": -0.008544921875, "step": 120 }, { "epoch": 0.1338495575221239, "grad_norm": 15.668647766113281, "learning_rate": 4.865155307846669e-07, "logits/chosen": -1.3828125, "logits/rejected": -1.4375, "logps/chosen": -232.0, "logps/rejected": -240.0, "loss": 0.4114, "rewards/accuracies": 0.7109375, "rewards/chosen": 1.2734375, "rewards/margins": 1.2421875, "rewards/rejected": 0.0323486328125, "step": 121 }, { "epoch": 0.13495575221238937, "grad_norm": 16.117341995239258, "learning_rate": 4.862235339063613e-07, "logits/chosen": -1.35546875, "logits/rejected": -1.26171875, "logps/chosen": -252.5, "logps/rejected": -268.0, "loss": 0.4789, "rewards/accuracies": 0.6640625, "rewards/chosen": 1.10546875, "rewards/margins": 0.9375, "rewards/rejected": 0.169189453125, "step": 122 }, { "epoch": 0.13606194690265486, "grad_norm": 15.315237998962402, "learning_rate": 4.859284988429533e-07, "logits/chosen": -1.390625, "logits/rejected": -1.37109375, "logps/chosen": -264.0, "logps/rejected": -302.0, "loss": 0.4574, "rewards/accuracies": 0.6796875, "rewards/chosen": 1.09375, "rewards/margins": 1.0234375, "rewards/rejected": 0.0693359375, "step": 123 }, { "epoch": 0.13716814159292035, "grad_norm": 13.38134765625, "learning_rate": 4.856304293890317e-07, "logits/chosen": -1.25, "logits/rejected": -1.18359375, "logps/chosen": -255.0, "logps/rejected": -253.5, "loss": 0.3681, "rewards/accuracies": 0.765625, "rewards/chosen": 1.3984375, "rewards/margins": 1.484375, "rewards/rejected": -0.08575439453125, "step": 124 }, { "epoch": 0.13827433628318583, "grad_norm": 17.225801467895508, "learning_rate": 4.853293293782118e-07, "logits/chosen": -1.39453125, "logits/rejected": -1.4140625, "logps/chosen": -276.0, "logps/rejected": -280.0, "loss": 0.458, "rewards/accuracies": 0.6796875, "rewards/chosen": 1.1796875, "rewards/margins": 1.08203125, "rewards/rejected": 0.09991455078125, "step": 125 }, { "epoch": 0.13938053097345132, "grad_norm": 14.186132431030273, "learning_rate": 4.850252026830863e-07, "logits/chosen": -1.359375, "logits/rejected": -1.26953125, "logps/chosen": -234.5, "logps/rejected": -252.5, "loss": 0.4436, "rewards/accuracies": 0.734375, "rewards/chosen": 1.25390625, "rewards/margins": 1.123046875, "rewards/rejected": 0.1328125, "step": 126 }, { "epoch": 0.1404867256637168, "grad_norm": 14.477481842041016, "learning_rate": 4.84718053215176e-07, "logits/chosen": -1.25390625, "logits/rejected": -1.23046875, "logps/chosen": -249.5, "logps/rejected": -256.0, "loss": 0.4314, "rewards/accuracies": 0.703125, "rewards/chosen": 1.296875, "rewards/margins": 1.140625, "rewards/rejected": 0.1552734375, "step": 127 }, { "epoch": 0.1415929203539823, "grad_norm": 15.153040885925293, "learning_rate": 4.844078849248785e-07, "logits/chosen": -1.30859375, "logits/rejected": -1.3125, "logps/chosen": -260.0, "logps/rejected": -292.0, "loss": 0.3964, "rewards/accuracies": 0.75, "rewards/chosen": 1.421875, "rewards/margins": 1.37890625, "rewards/rejected": 0.0396728515625, "step": 128 }, { "epoch": 0.1426991150442478, "grad_norm": 14.35177230834961, "learning_rate": 4.840947018014182e-07, "logits/chosen": -1.23828125, "logits/rejected": -1.19140625, "logps/chosen": -256.5, "logps/rejected": -251.5, "loss": 0.4107, "rewards/accuracies": 0.7578125, "rewards/chosen": 1.33203125, "rewards/margins": 1.2734375, "rewards/rejected": 0.060028076171875, "step": 129 }, { "epoch": 0.14380530973451328, "grad_norm": 14.168734550476074, "learning_rate": 4.837785078727948e-07, "logits/chosen": -1.25, "logits/rejected": -1.19140625, "logps/chosen": -248.0, "logps/rejected": -284.0, "loss": 0.3812, "rewards/accuracies": 0.7578125, "rewards/chosen": 1.3984375, "rewards/margins": 1.390625, "rewards/rejected": 0.00927734375, "step": 130 }, { "epoch": 0.14491150442477876, "grad_norm": 15.743026733398438, "learning_rate": 4.834593072057313e-07, "logits/chosen": -1.28125, "logits/rejected": -1.30078125, "logps/chosen": -246.0, "logps/rejected": -265.0, "loss": 0.4586, "rewards/accuracies": 0.6875, "rewards/chosen": 1.31640625, "rewards/margins": 1.08984375, "rewards/rejected": 0.2255859375, "step": 131 }, { "epoch": 0.14601769911504425, "grad_norm": 16.969074249267578, "learning_rate": 4.831371039056217e-07, "logits/chosen": -1.20703125, "logits/rejected": -1.1484375, "logps/chosen": -275.0, "logps/rejected": -296.0, "loss": 0.4373, "rewards/accuracies": 0.7265625, "rewards/chosen": 1.2109375, "rewards/margins": 1.19921875, "rewards/rejected": 0.0108642578125, "step": 132 }, { "epoch": 0.14712389380530974, "grad_norm": 14.101773262023926, "learning_rate": 4.828119021164786e-07, "logits/chosen": -1.25390625, "logits/rejected": -1.296875, "logps/chosen": -246.5, "logps/rejected": -277.0, "loss": 0.3919, "rewards/accuracies": 0.7578125, "rewards/chosen": 1.359375, "rewards/margins": 1.43359375, "rewards/rejected": -0.07568359375, "step": 133 }, { "epoch": 0.14823008849557523, "grad_norm": 15.83488941192627, "learning_rate": 4.824837060208795e-07, "logits/chosen": -1.3046875, "logits/rejected": -1.265625, "logps/chosen": -275.0, "logps/rejected": -268.5, "loss": 0.4578, "rewards/accuracies": 0.703125, "rewards/chosen": 1.1953125, "rewards/margins": 0.998046875, "rewards/rejected": 0.193359375, "step": 134 }, { "epoch": 0.14933628318584072, "grad_norm": 13.669934272766113, "learning_rate": 4.82152519839913e-07, "logits/chosen": -1.390625, "logits/rejected": -1.2578125, "logps/chosen": -241.5, "logps/rejected": -243.5, "loss": 0.3765, "rewards/accuracies": 0.796875, "rewards/chosen": 1.4375, "rewards/margins": 1.40625, "rewards/rejected": 0.0296630859375, "step": 135 }, { "epoch": 0.1504424778761062, "grad_norm": 16.85657501220703, "learning_rate": 4.818183478331247e-07, "logits/chosen": -1.13671875, "logits/rejected": -1.25390625, "logps/chosen": -257.5, "logps/rejected": -277.5, "loss": 0.4258, "rewards/accuracies": 0.6953125, "rewards/chosen": 1.390625, "rewards/margins": 1.3125, "rewards/rejected": 0.0772705078125, "step": 136 }, { "epoch": 0.1515486725663717, "grad_norm": 15.21373462677002, "learning_rate": 4.814811942984625e-07, "logits/chosen": -1.29296875, "logits/rejected": -1.1953125, "logps/chosen": -256.5, "logps/rejected": -240.0, "loss": 0.4232, "rewards/accuracies": 0.78125, "rewards/chosen": 1.19921875, "rewards/margins": 1.12890625, "rewards/rejected": 0.0693359375, "step": 137 }, { "epoch": 0.15265486725663716, "grad_norm": 13.69796085357666, "learning_rate": 4.811410635722209e-07, "logits/chosen": -1.2890625, "logits/rejected": -1.21875, "logps/chosen": -236.5, "logps/rejected": -257.0, "loss": 0.3722, "rewards/accuracies": 0.8046875, "rewards/chosen": 1.4453125, "rewards/margins": 1.55078125, "rewards/rejected": -0.10595703125, "step": 138 }, { "epoch": 0.15376106194690264, "grad_norm": 15.000753402709961, "learning_rate": 4.807979600289857e-07, "logits/chosen": -1.21875, "logits/rejected": -1.27734375, "logps/chosen": -274.0, "logps/rejected": -297.0, "loss": 0.3709, "rewards/accuracies": 0.75, "rewards/chosen": 1.30078125, "rewards/margins": 1.515625, "rewards/rejected": -0.21240234375, "step": 139 }, { "epoch": 0.15486725663716813, "grad_norm": 13.44487476348877, "learning_rate": 4.804518880815776e-07, "logits/chosen": -1.15625, "logits/rejected": -1.27734375, "logps/chosen": -248.5, "logps/rejected": -267.5, "loss": 0.3818, "rewards/accuracies": 0.7265625, "rewards/chosen": 1.37109375, "rewards/margins": 1.515625, "rewards/rejected": -0.144775390625, "step": 140 }, { "epoch": 0.15597345132743362, "grad_norm": 15.1209135055542, "learning_rate": 4.801028521809951e-07, "logits/chosen": -1.21875, "logits/rejected": -1.16796875, "logps/chosen": -273.0, "logps/rejected": -271.5, "loss": 0.4027, "rewards/accuracies": 0.7578125, "rewards/chosen": 1.19140625, "rewards/margins": 1.3125, "rewards/rejected": -0.122802734375, "step": 141 }, { "epoch": 0.1570796460176991, "grad_norm": 16.363567352294922, "learning_rate": 4.797508568163578e-07, "logits/chosen": -1.33203125, "logits/rejected": -1.2109375, "logps/chosen": -262.0, "logps/rejected": -269.0, "loss": 0.4581, "rewards/accuracies": 0.7265625, "rewards/chosen": 1.072265625, "rewards/margins": 1.169921875, "rewards/rejected": -0.097412109375, "step": 142 }, { "epoch": 0.1581858407079646, "grad_norm": 13.670063972473145, "learning_rate": 4.793959065148484e-07, "logits/chosen": -1.2734375, "logits/rejected": -1.2421875, "logps/chosen": -240.0, "logps/rejected": -254.5, "loss": 0.3719, "rewards/accuracies": 0.7734375, "rewards/chosen": 1.296875, "rewards/margins": 1.4765625, "rewards/rejected": -0.179443359375, "step": 143 }, { "epoch": 0.1592920353982301, "grad_norm": 14.17078971862793, "learning_rate": 4.790380058416542e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.23046875, "logps/chosen": -240.0, "logps/rejected": -259.5, "loss": 0.3726, "rewards/accuracies": 0.7734375, "rewards/chosen": 1.35546875, "rewards/margins": 1.625, "rewards/rejected": -0.2666015625, "step": 144 }, { "epoch": 0.16039823008849557, "grad_norm": 13.858586311340332, "learning_rate": 4.786771593999089e-07, "logits/chosen": -1.25390625, "logits/rejected": -1.25, "logps/chosen": -242.5, "logps/rejected": -251.5, "loss": 0.377, "rewards/accuracies": 0.7734375, "rewards/chosen": 1.3515625, "rewards/margins": 1.5078125, "rewards/rejected": -0.154296875, "step": 145 }, { "epoch": 0.16150442477876106, "grad_norm": 15.108954429626465, "learning_rate": 4.783133718306331e-07, "logits/chosen": -1.21875, "logits/rejected": -1.28125, "logps/chosen": -266.0, "logps/rejected": -305.0, "loss": 0.4185, "rewards/accuracies": 0.765625, "rewards/chosen": 1.21875, "rewards/margins": 1.37109375, "rewards/rejected": -0.15087890625, "step": 146 }, { "epoch": 0.16261061946902655, "grad_norm": 14.861040115356445, "learning_rate": 4.779466478126746e-07, "logits/chosen": -1.3828125, "logits/rejected": -1.30078125, "logps/chosen": -242.0, "logps/rejected": -239.5, "loss": 0.3849, "rewards/accuracies": 0.765625, "rewards/chosen": 1.140625, "rewards/margins": 1.4296875, "rewards/rejected": -0.2919921875, "step": 147 }, { "epoch": 0.16371681415929204, "grad_norm": 14.671175956726074, "learning_rate": 4.775769920626483e-07, "logits/chosen": -1.37890625, "logits/rejected": -1.27734375, "logps/chosen": -238.5, "logps/rejected": -251.0, "loss": 0.4109, "rewards/accuracies": 0.71875, "rewards/chosen": 1.1484375, "rewards/margins": 1.27734375, "rewards/rejected": -0.129150390625, "step": 148 }, { "epoch": 0.16482300884955753, "grad_norm": 13.885614395141602, "learning_rate": 4.772044093348757e-07, "logits/chosen": -1.24609375, "logits/rejected": -1.23828125, "logps/chosen": -245.5, "logps/rejected": -247.0, "loss": 0.4042, "rewards/accuracies": 0.75, "rewards/chosen": 1.162109375, "rewards/margins": 1.361328125, "rewards/rejected": -0.19720458984375, "step": 149 }, { "epoch": 0.16592920353982302, "grad_norm": 15.551752090454102, "learning_rate": 4.7682890442132336e-07, "logits/chosen": -1.3671875, "logits/rejected": -1.2265625, "logps/chosen": -255.0, "logps/rejected": -252.0, "loss": 0.415, "rewards/accuracies": 0.71875, "rewards/chosen": 1.095703125, "rewards/margins": 1.37890625, "rewards/rejected": -0.28369140625, "step": 150 }, { "epoch": 0.16592920353982302, "eval_logits/chosen": -1.2870413064956665, "eval_logits/rejected": -1.2431591749191284, "eval_logps/chosen": -248.97512817382812, "eval_logps/rejected": -261.86566162109375, "eval_loss": 0.39721065759658813, "eval_rewards/accuracies": 0.7473672032356262, "eval_rewards/chosen": 1.184818148612976, "eval_rewards/margins": 1.4427666664123535, "eval_rewards/rejected": -0.25743111968040466, "eval_runtime": 193.0648, "eval_samples_per_second": 66.573, "eval_steps_per_second": 1.041, "step": 150 }, { "epoch": 0.1670353982300885, "grad_norm": 15.598936080932617, "learning_rate": 4.7645048215154156e-07, "logits/chosen": -1.3046875, "logits/rejected": -1.2890625, "logps/chosen": -242.0, "logps/rejected": -260.0, "loss": 0.4404, "rewards/accuracies": 0.6875, "rewards/chosen": 1.146484375, "rewards/margins": 1.39453125, "rewards/rejected": -0.24755859375, "step": 151 }, { "epoch": 0.168141592920354, "grad_norm": 13.759398460388184, "learning_rate": 4.760691473926021e-07, "logits/chosen": -1.24609375, "logits/rejected": -1.234375, "logps/chosen": -248.5, "logps/rejected": -269.0, "loss": 0.3753, "rewards/accuracies": 0.75, "rewards/chosen": 1.2890625, "rewards/margins": 1.55859375, "rewards/rejected": -0.265625, "step": 152 }, { "epoch": 0.16924778761061948, "grad_norm": 17.32530975341797, "learning_rate": 4.756849050490357e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.1640625, "logps/chosen": -287.0, "logps/rejected": -302.0, "loss": 0.4487, "rewards/accuracies": 0.7265625, "rewards/chosen": 1.01953125, "rewards/margins": 1.23828125, "rewards/rejected": -0.21826171875, "step": 153 }, { "epoch": 0.17035398230088494, "grad_norm": 16.289810180664062, "learning_rate": 4.75297760062769e-07, "logits/chosen": -1.36328125, "logits/rejected": -1.296875, "logps/chosen": -271.0, "logps/rejected": -266.5, "loss": 0.4189, "rewards/accuracies": 0.703125, "rewards/chosen": 1.0703125, "rewards/margins": 1.375, "rewards/rejected": -0.30419921875, "step": 154 }, { "epoch": 0.17146017699115043, "grad_norm": 15.245888710021973, "learning_rate": 4.749077174130608e-07, "logits/chosen": -1.3203125, "logits/rejected": -1.23828125, "logps/chosen": -264.0, "logps/rejected": -282.0, "loss": 0.4183, "rewards/accuracies": 0.71875, "rewards/chosen": 1.16015625, "rewards/margins": 1.40234375, "rewards/rejected": -0.240234375, "step": 155 }, { "epoch": 0.17256637168141592, "grad_norm": 14.452110290527344, "learning_rate": 4.7451478211643835e-07, "logits/chosen": -1.39453125, "logits/rejected": -1.30859375, "logps/chosen": -253.0, "logps/rejected": -256.0, "loss": 0.3993, "rewards/accuracies": 0.7421875, "rewards/chosen": 1.2421875, "rewards/margins": 1.44921875, "rewards/rejected": -0.20654296875, "step": 156 }, { "epoch": 0.1736725663716814, "grad_norm": 14.378584861755371, "learning_rate": 4.741189592266325e-07, "logits/chosen": -1.29296875, "logits/rejected": -1.26171875, "logps/chosen": -231.5, "logps/rejected": -273.5, "loss": 0.3664, "rewards/accuracies": 0.7734375, "rewards/chosen": 1.3125, "rewards/margins": 1.70703125, "rewards/rejected": -0.39453125, "step": 157 }, { "epoch": 0.1747787610619469, "grad_norm": 13.193842887878418, "learning_rate": 4.7372025383451274e-07, "logits/chosen": -1.12109375, "logits/rejected": -1.203125, "logps/chosen": -240.0, "logps/rejected": -260.0, "loss": 0.3485, "rewards/accuracies": 0.78125, "rewards/chosen": 1.12109375, "rewards/margins": 1.6171875, "rewards/rejected": -0.4912109375, "step": 158 }, { "epoch": 0.17588495575221239, "grad_norm": 13.745351791381836, "learning_rate": 4.7331867106802204e-07, "logits/chosen": -1.2421875, "logits/rejected": -1.1875, "logps/chosen": -258.5, "logps/rejected": -265.0, "loss": 0.3891, "rewards/accuracies": 0.7109375, "rewards/chosen": 1.23046875, "rewards/margins": 1.5859375, "rewards/rejected": -0.35546875, "step": 159 }, { "epoch": 0.17699115044247787, "grad_norm": 14.0711669921875, "learning_rate": 4.7291421609211045e-07, "logits/chosen": -1.31640625, "logits/rejected": -1.24609375, "logps/chosen": -251.5, "logps/rejected": -282.0, "loss": 0.3999, "rewards/accuracies": 0.75, "rewards/chosen": 1.1875, "rewards/margins": 1.3984375, "rewards/rejected": -0.20654296875, "step": 160 }, { "epoch": 0.17809734513274336, "grad_norm": 13.304108619689941, "learning_rate": 4.725068941086692e-07, "logits/chosen": -1.37109375, "logits/rejected": -1.1953125, "logps/chosen": -255.5, "logps/rejected": -262.0, "loss": 0.3558, "rewards/accuracies": 0.7734375, "rewards/chosen": 1.2265625, "rewards/margins": 1.63671875, "rewards/rejected": -0.4140625, "step": 161 }, { "epoch": 0.17920353982300885, "grad_norm": 13.896252632141113, "learning_rate": 4.7209671035646304e-07, "logits/chosen": -1.3125, "logits/rejected": -1.2265625, "logps/chosen": -248.5, "logps/rejected": -264.0, "loss": 0.3942, "rewards/accuracies": 0.75, "rewards/chosen": 1.15234375, "rewards/margins": 1.42578125, "rewards/rejected": -0.27294921875, "step": 162 }, { "epoch": 0.18030973451327434, "grad_norm": 14.796649932861328, "learning_rate": 4.7168367011106367e-07, "logits/chosen": -1.30859375, "logits/rejected": -1.30078125, "logps/chosen": -245.0, "logps/rejected": -262.5, "loss": 0.3799, "rewards/accuracies": 0.7578125, "rewards/chosen": 1.13671875, "rewards/margins": 1.5703125, "rewards/rejected": -0.431640625, "step": 163 }, { "epoch": 0.18141592920353983, "grad_norm": 16.078460693359375, "learning_rate": 4.712677786847814e-07, "logits/chosen": -1.44140625, "logits/rejected": -1.1875, "logps/chosen": -243.5, "logps/rejected": -250.0, "loss": 0.4507, "rewards/accuracies": 0.703125, "rewards/chosen": 1.044921875, "rewards/margins": 1.2265625, "rewards/rejected": -0.18359375, "step": 164 }, { "epoch": 0.18252212389380532, "grad_norm": 13.583531379699707, "learning_rate": 4.708490414265971e-07, "logits/chosen": -1.375, "logits/rejected": -1.1796875, "logps/chosen": -262.0, "logps/rejected": -272.5, "loss": 0.3486, "rewards/accuracies": 0.8046875, "rewards/chosen": 1.18359375, "rewards/margins": 1.63671875, "rewards/rejected": -0.4521484375, "step": 165 }, { "epoch": 0.1836283185840708, "grad_norm": 14.29465389251709, "learning_rate": 4.7042746372209296e-07, "logits/chosen": -1.25, "logits/rejected": -1.32421875, "logps/chosen": -249.5, "logps/rejected": -278.0, "loss": 0.357, "rewards/accuracies": 0.8046875, "rewards/chosen": 1.3046875, "rewards/margins": 1.71875, "rewards/rejected": -0.416015625, "step": 166 }, { "epoch": 0.1847345132743363, "grad_norm": 14.11926555633545, "learning_rate": 4.700030509933839e-07, "logits/chosen": -1.12890625, "logits/rejected": -1.1484375, "logps/chosen": -235.5, "logps/rejected": -273.0, "loss": 0.3775, "rewards/accuracies": 0.796875, "rewards/chosen": 1.33984375, "rewards/margins": 1.60546875, "rewards/rejected": -0.2646484375, "step": 167 }, { "epoch": 0.18584070796460178, "grad_norm": 13.987667083740234, "learning_rate": 4.6957580869904707e-07, "logits/chosen": -1.234375, "logits/rejected": -1.140625, "logps/chosen": -266.0, "logps/rejected": -280.0, "loss": 0.3593, "rewards/accuracies": 0.796875, "rewards/chosen": 1.15625, "rewards/margins": 1.55078125, "rewards/rejected": -0.3935546875, "step": 168 }, { "epoch": 0.18694690265486727, "grad_norm": 14.725763320922852, "learning_rate": 4.691457423340524e-07, "logits/chosen": -1.2265625, "logits/rejected": -1.076171875, "logps/chosen": -261.0, "logps/rejected": -248.5, "loss": 0.3935, "rewards/accuracies": 0.7421875, "rewards/chosen": 0.953125, "rewards/margins": 1.39453125, "rewards/rejected": -0.439453125, "step": 169 }, { "epoch": 0.18805309734513273, "grad_norm": 15.593293190002441, "learning_rate": 4.6871285742969114e-07, "logits/chosen": -1.171875, "logits/rejected": -1.21875, "logps/chosen": -267.0, "logps/rejected": -278.0, "loss": 0.4233, "rewards/accuracies": 0.6875, "rewards/chosen": 1.23828125, "rewards/margins": 1.5234375, "rewards/rejected": -0.279296875, "step": 170 }, { "epoch": 0.18915929203539822, "grad_norm": 13.978684425354004, "learning_rate": 4.682771595535056e-07, "logits/chosen": -1.2890625, "logits/rejected": -1.3046875, "logps/chosen": -244.5, "logps/rejected": -274.0, "loss": 0.3605, "rewards/accuracies": 0.8125, "rewards/chosen": 1.41796875, "rewards/margins": 1.6171875, "rewards/rejected": -0.19677734375, "step": 171 }, { "epoch": 0.1902654867256637, "grad_norm": 12.64192008972168, "learning_rate": 4.678386543092168e-07, "logits/chosen": -1.1953125, "logits/rejected": -1.16015625, "logps/chosen": -243.5, "logps/rejected": -267.0, "loss": 0.35, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.32421875, "rewards/margins": 1.8515625, "rewards/rejected": -0.52734375, "step": 172 }, { "epoch": 0.1913716814159292, "grad_norm": 15.251437187194824, "learning_rate": 4.673973473366527e-07, "logits/chosen": -1.25390625, "logits/rejected": -1.2578125, "logps/chosen": -252.5, "logps/rejected": -270.5, "loss": 0.386, "rewards/accuracies": 0.71875, "rewards/chosen": 1.33984375, "rewards/margins": 1.71875, "rewards/rejected": -0.3818359375, "step": 173 }, { "epoch": 0.19247787610619468, "grad_norm": 11.346704483032227, "learning_rate": 4.669532443116757e-07, "logits/chosen": -1.2890625, "logits/rejected": -1.20703125, "logps/chosen": -227.0, "logps/rejected": -244.5, "loss": 0.2852, "rewards/accuracies": 0.84375, "rewards/chosen": 1.46484375, "rewards/margins": 2.0625, "rewards/rejected": -0.59765625, "step": 174 }, { "epoch": 0.19358407079646017, "grad_norm": 17.457523345947266, "learning_rate": 4.6650635094610966e-07, "logits/chosen": -1.34375, "logits/rejected": -1.12109375, "logps/chosen": -280.0, "logps/rejected": -277.0, "loss": 0.4692, "rewards/accuracies": 0.6953125, "rewards/chosen": 1.05859375, "rewards/margins": 1.22265625, "rewards/rejected": -0.166748046875, "step": 175 }, { "epoch": 0.19469026548672566, "grad_norm": 14.530098915100098, "learning_rate": 4.6605667298766607e-07, "logits/chosen": -1.30078125, "logits/rejected": -1.21875, "logps/chosen": -241.5, "logps/rejected": -260.0, "loss": 0.3907, "rewards/accuracies": 0.71875, "rewards/chosen": 1.17578125, "rewards/margins": 1.67578125, "rewards/rejected": -0.501953125, "step": 176 }, { "epoch": 0.19579646017699115, "grad_norm": 15.266855239868164, "learning_rate": 4.656042162198708e-07, "logits/chosen": -1.43359375, "logits/rejected": -1.3046875, "logps/chosen": -235.0, "logps/rejected": -265.0, "loss": 0.4364, "rewards/accuracies": 0.75, "rewards/chosen": 1.0703125, "rewards/margins": 1.4296875, "rewards/rejected": -0.357421875, "step": 177 }, { "epoch": 0.19690265486725664, "grad_norm": 12.054651260375977, "learning_rate": 4.6514898646198896e-07, "logits/chosen": -1.3203125, "logits/rejected": -1.26171875, "logps/chosen": -257.0, "logps/rejected": -271.0, "loss": 0.3194, "rewards/accuracies": 0.8203125, "rewards/chosen": 1.2734375, "rewards/margins": 1.91015625, "rewards/rejected": -0.6328125, "step": 178 }, { "epoch": 0.19800884955752213, "grad_norm": 15.28715705871582, "learning_rate": 4.6469098956895076e-07, "logits/chosen": -1.3125, "logits/rejected": -1.22265625, "logps/chosen": -265.5, "logps/rejected": -277.0, "loss": 0.3848, "rewards/accuracies": 0.765625, "rewards/chosen": 1.28125, "rewards/margins": 1.65234375, "rewards/rejected": -0.373046875, "step": 179 }, { "epoch": 0.19911504424778761, "grad_norm": 14.788736343383789, "learning_rate": 4.6423023143127557e-07, "logits/chosen": -1.2265625, "logits/rejected": -1.3984375, "logps/chosen": -252.0, "logps/rejected": -272.0, "loss": 0.3994, "rewards/accuracies": 0.7734375, "rewards/chosen": 1.0859375, "rewards/margins": 1.5, "rewards/rejected": -0.4150390625, "step": 180 }, { "epoch": 0.2002212389380531, "grad_norm": 14.42548942565918, "learning_rate": 4.637667179749968e-07, "logits/chosen": -1.23046875, "logits/rejected": -1.21484375, "logps/chosen": -272.5, "logps/rejected": -274.5, "loss": 0.3871, "rewards/accuracies": 0.6875, "rewards/chosen": 1.0390625, "rewards/margins": 1.515625, "rewards/rejected": -0.48046875, "step": 181 }, { "epoch": 0.2013274336283186, "grad_norm": 13.830480575561523, "learning_rate": 4.63300455161585e-07, "logits/chosen": -1.29296875, "logits/rejected": -1.19140625, "logps/chosen": -250.0, "logps/rejected": -248.0, "loss": 0.3167, "rewards/accuracies": 0.796875, "rewards/chosen": 1.34375, "rewards/margins": 1.92578125, "rewards/rejected": -0.5859375, "step": 182 }, { "epoch": 0.20243362831858408, "grad_norm": 14.639776229858398, "learning_rate": 4.6283144898787174e-07, "logits/chosen": -1.32421875, "logits/rejected": -1.20703125, "logps/chosen": -247.5, "logps/rejected": -279.0, "loss": 0.3672, "rewards/accuracies": 0.78125, "rewards/chosen": 1.32421875, "rewards/margins": 1.79296875, "rewards/rejected": -0.46484375, "step": 183 }, { "epoch": 0.20353982300884957, "grad_norm": 13.662202835083008, "learning_rate": 4.6235970548597224e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.234375, "logps/chosen": -231.0, "logps/rejected": -240.0, "loss": 0.3531, "rewards/accuracies": 0.75, "rewards/chosen": 1.28125, "rewards/margins": 1.78515625, "rewards/rejected": -0.505859375, "step": 184 }, { "epoch": 0.20464601769911506, "grad_norm": 13.101706504821777, "learning_rate": 4.6188523072320777e-07, "logits/chosen": -1.2421875, "logits/rejected": -1.14453125, "logps/chosen": -253.0, "logps/rejected": -273.0, "loss": 0.3276, "rewards/accuracies": 0.8046875, "rewards/chosen": 1.2421875, "rewards/margins": 1.82421875, "rewards/rejected": -0.5830078125, "step": 185 }, { "epoch": 0.20575221238938052, "grad_norm": 16.33759307861328, "learning_rate": 4.614080308020277e-07, "logits/chosen": -1.25, "logits/rejected": -1.2265625, "logps/chosen": -258.0, "logps/rejected": -290.0, "loss": 0.3694, "rewards/accuracies": 0.78125, "rewards/chosen": 1.203125, "rewards/margins": 1.69140625, "rewards/rejected": -0.48828125, "step": 186 }, { "epoch": 0.206858407079646, "grad_norm": 13.627776145935059, "learning_rate": 4.609281118599311e-07, "logits/chosen": -1.3046875, "logits/rejected": -1.21875, "logps/chosen": -238.5, "logps/rejected": -239.0, "loss": 0.4007, "rewards/accuracies": 0.7265625, "rewards/chosen": 1.056640625, "rewards/margins": 1.55078125, "rewards/rejected": -0.4931640625, "step": 187 }, { "epoch": 0.2079646017699115, "grad_norm": 13.673922538757324, "learning_rate": 4.6044548006938734e-07, "logits/chosen": -1.3671875, "logits/rejected": -1.1796875, "logps/chosen": -247.5, "logps/rejected": -254.5, "loss": 0.3592, "rewards/accuracies": 0.78125, "rewards/chosen": 1.29296875, "rewards/margins": 1.7265625, "rewards/rejected": -0.4375, "step": 188 }, { "epoch": 0.20907079646017698, "grad_norm": 14.20157527923584, "learning_rate": 4.5996014163775745e-07, "logits/chosen": -1.30078125, "logits/rejected": -1.26953125, "logps/chosen": -268.5, "logps/rejected": -272.0, "loss": 0.3429, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.32421875, "rewards/margins": 1.84765625, "rewards/rejected": -0.5234375, "step": 189 }, { "epoch": 0.21017699115044247, "grad_norm": 14.90439510345459, "learning_rate": 4.5947210280721353e-07, "logits/chosen": -1.34375, "logits/rejected": -1.2421875, "logps/chosen": -248.0, "logps/rejected": -285.0, "loss": 0.373, "rewards/accuracies": 0.7265625, "rewards/chosen": 1.203125, "rewards/margins": 1.765625, "rewards/rejected": -0.5625, "step": 190 }, { "epoch": 0.21128318584070796, "grad_norm": 14.063448905944824, "learning_rate": 4.589813698546592e-07, "logits/chosen": -1.34375, "logits/rejected": -1.1328125, "logps/chosen": -256.0, "logps/rejected": -274.0, "loss": 0.3471, "rewards/accuracies": 0.8125, "rewards/chosen": 1.21875, "rewards/margins": 1.828125, "rewards/rejected": -0.607421875, "step": 191 }, { "epoch": 0.21238938053097345, "grad_norm": 13.391234397888184, "learning_rate": 4.584879490916481e-07, "logits/chosen": -1.30078125, "logits/rejected": -1.234375, "logps/chosen": -247.5, "logps/rejected": -241.5, "loss": 0.356, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.0390625, "rewards/margins": 1.9296875, "rewards/rejected": -0.892578125, "step": 192 }, { "epoch": 0.21349557522123894, "grad_norm": 13.415105819702148, "learning_rate": 4.5799184686430343e-07, "logits/chosen": -1.23046875, "logits/rejected": -1.09375, "logps/chosen": -251.0, "logps/rejected": -257.5, "loss": 0.34, "rewards/accuracies": 0.75, "rewards/chosen": 1.1875, "rewards/margins": 1.87109375, "rewards/rejected": -0.689453125, "step": 193 }, { "epoch": 0.21460176991150443, "grad_norm": 13.00170612335205, "learning_rate": 4.574930695532356e-07, "logits/chosen": -1.30078125, "logits/rejected": -1.32421875, "logps/chosen": -257.0, "logps/rejected": -273.0, "loss": 0.3455, "rewards/accuracies": 0.8046875, "rewards/chosen": 1.203125, "rewards/margins": 1.82421875, "rewards/rejected": -0.623046875, "step": 194 }, { "epoch": 0.2157079646017699, "grad_norm": 13.366878509521484, "learning_rate": 4.569916235734611e-07, "logits/chosen": -1.2890625, "logits/rejected": -1.19921875, "logps/chosen": -240.5, "logps/rejected": -272.0, "loss": 0.3792, "rewards/accuracies": 0.7109375, "rewards/chosen": 1.2109375, "rewards/margins": 1.7578125, "rewards/rejected": -0.544921875, "step": 195 }, { "epoch": 0.2168141592920354, "grad_norm": 14.402266502380371, "learning_rate": 4.5648751537431897e-07, "logits/chosen": -1.21484375, "logits/rejected": -1.171875, "logps/chosen": -250.5, "logps/rejected": -286.0, "loss": 0.428, "rewards/accuracies": 0.71875, "rewards/chosen": 0.9453125, "rewards/margins": 1.3984375, "rewards/rejected": -0.4560546875, "step": 196 }, { "epoch": 0.2179203539823009, "grad_norm": 15.003867149353027, "learning_rate": 4.559807514393885e-07, "logits/chosen": -1.30859375, "logits/rejected": -1.2421875, "logps/chosen": -276.5, "logps/rejected": -286.0, "loss": 0.35, "rewards/accuracies": 0.75, "rewards/chosen": 1.296875, "rewards/margins": 1.87109375, "rewards/rejected": -0.5810546875, "step": 197 }, { "epoch": 0.21902654867256638, "grad_norm": 14.217790603637695, "learning_rate": 4.5547133828640595e-07, "logits/chosen": -1.25, "logits/rejected": -1.24609375, "logps/chosen": -267.0, "logps/rejected": -266.0, "loss": 0.3393, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.27734375, "rewards/margins": 1.83203125, "rewards/rejected": -0.5556640625, "step": 198 }, { "epoch": 0.22013274336283187, "grad_norm": 13.773700714111328, "learning_rate": 4.5495928246717995e-07, "logits/chosen": -1.2421875, "logits/rejected": -1.28125, "logps/chosen": -265.0, "logps/rejected": -292.0, "loss": 0.3351, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.375, "rewards/margins": 2.0546875, "rewards/rejected": -0.67578125, "step": 199 }, { "epoch": 0.22123893805309736, "grad_norm": 14.733463287353516, "learning_rate": 4.544445905675081e-07, "logits/chosen": -1.265625, "logits/rejected": -1.18359375, "logps/chosen": -266.0, "logps/rejected": -281.5, "loss": 0.3673, "rewards/accuracies": 0.78125, "rewards/chosen": 1.0859375, "rewards/margins": 1.7265625, "rewards/rejected": -0.642578125, "step": 200 }, { "epoch": 0.22123893805309736, "eval_logits/chosen": -1.276119351387024, "eval_logits/rejected": -1.2190414667129517, "eval_logps/chosen": -249.2039794921875, "eval_logps/rejected": -265.0248718261719, "eval_loss": 0.37490636110305786, "eval_rewards/accuracies": 0.7651365399360657, "eval_rewards/chosen": 1.1634211540222168, "eval_rewards/margins": 1.7447527647018433, "eval_rewards/rejected": -0.5807631611824036, "eval_runtime": 192.9266, "eval_samples_per_second": 66.621, "eval_steps_per_second": 1.042, "step": 200 }, { "epoch": 0.22234513274336284, "grad_norm": 14.183818817138672, "learning_rate": 4.539272692070919e-07, "logits/chosen": -1.2890625, "logits/rejected": -1.203125, "logps/chosen": -270.0, "logps/rejected": -238.5, "loss": 0.3398, "rewards/accuracies": 0.828125, "rewards/chosen": 1.2421875, "rewards/margins": 1.84375, "rewards/rejected": -0.6015625, "step": 201 }, { "epoch": 0.2234513274336283, "grad_norm": 14.671875953674316, "learning_rate": 4.534073250394515e-07, "logits/chosen": -1.40625, "logits/rejected": -1.25390625, "logps/chosen": -245.5, "logps/rejected": -261.5, "loss": 0.4247, "rewards/accuracies": 0.6796875, "rewards/chosen": 0.904296875, "rewards/margins": 1.375, "rewards/rejected": -0.47265625, "step": 202 }, { "epoch": 0.2245575221238938, "grad_norm": 14.409346580505371, "learning_rate": 4.5288476475184025e-07, "logits/chosen": -1.23828125, "logits/rejected": -1.15234375, "logps/chosen": -251.5, "logps/rejected": -259.0, "loss": 0.3738, "rewards/accuracies": 0.7578125, "rewards/chosen": 1.10546875, "rewards/margins": 1.73046875, "rewards/rejected": -0.623046875, "step": 203 }, { "epoch": 0.22566371681415928, "grad_norm": 16.879392623901367, "learning_rate": 4.523595950651587e-07, "logits/chosen": -1.21484375, "logits/rejected": -1.25390625, "logps/chosen": -272.0, "logps/rejected": -281.0, "loss": 0.4152, "rewards/accuracies": 0.75, "rewards/chosen": 1.0546875, "rewards/margins": 1.640625, "rewards/rejected": -0.58203125, "step": 204 }, { "epoch": 0.22676991150442477, "grad_norm": 13.093546867370605, "learning_rate": 4.518318227338681e-07, "logits/chosen": -1.20703125, "logits/rejected": -1.1953125, "logps/chosen": -272.0, "logps/rejected": -275.0, "loss": 0.3398, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.07421875, "rewards/margins": 1.71484375, "rewards/rejected": -0.640625, "step": 205 }, { "epoch": 0.22787610619469026, "grad_norm": 14.883780479431152, "learning_rate": 4.5130145454590374e-07, "logits/chosen": -1.28515625, "logits/rejected": -1.140625, "logps/chosen": -247.0, "logps/rejected": -279.0, "loss": 0.3714, "rewards/accuracies": 0.7734375, "rewards/chosen": 1.12890625, "rewards/margins": 1.84375, "rewards/rejected": -0.71875, "step": 206 }, { "epoch": 0.22898230088495575, "grad_norm": 13.462334632873535, "learning_rate": 4.5076849732258737e-07, "logits/chosen": -1.30078125, "logits/rejected": -1.1953125, "logps/chosen": -233.0, "logps/rejected": -231.5, "loss": 0.3624, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2578125, "rewards/margins": 1.75, "rewards/rejected": -0.4931640625, "step": 207 }, { "epoch": 0.23008849557522124, "grad_norm": 13.485892295837402, "learning_rate": 4.5023295791853937e-07, "logits/chosen": -1.30859375, "logits/rejected": -1.25390625, "logps/chosen": -243.0, "logps/rejected": -284.0, "loss": 0.3465, "rewards/accuracies": 0.8046875, "rewards/chosen": 1.36328125, "rewards/margins": 1.86328125, "rewards/rejected": -0.4951171875, "step": 208 }, { "epoch": 0.23119469026548672, "grad_norm": 13.468306541442871, "learning_rate": 4.496948432215912e-07, "logits/chosen": -1.2734375, "logits/rejected": -1.14453125, "logps/chosen": -239.0, "logps/rejected": -231.5, "loss": 0.3881, "rewards/accuracies": 0.828125, "rewards/chosen": 1.07421875, "rewards/margins": 1.6484375, "rewards/rejected": -0.576171875, "step": 209 }, { "epoch": 0.2323008849557522, "grad_norm": 14.274983406066895, "learning_rate": 4.4915416015269614e-07, "logits/chosen": -1.30078125, "logits/rejected": -1.265625, "logps/chosen": -271.0, "logps/rejected": -279.5, "loss": 0.3449, "rewards/accuracies": 0.7734375, "rewards/chosen": 1.24609375, "rewards/margins": 1.875, "rewards/rejected": -0.6298828125, "step": 210 }, { "epoch": 0.2334070796460177, "grad_norm": 14.726081848144531, "learning_rate": 4.486109156658405e-07, "logits/chosen": -1.25390625, "logits/rejected": -1.30078125, "logps/chosen": -223.0, "logps/rejected": -258.0, "loss": 0.3548, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.26953125, "rewards/margins": 1.83203125, "rewards/rejected": -0.5556640625, "step": 211 }, { "epoch": 0.2345132743362832, "grad_norm": 14.424053192138672, "learning_rate": 4.480651167479544e-07, "logits/chosen": -1.328125, "logits/rejected": -1.18359375, "logps/chosen": -235.5, "logps/rejected": -251.0, "loss": 0.3725, "rewards/accuracies": 0.7578125, "rewards/chosen": 1.3125, "rewards/margins": 1.7109375, "rewards/rejected": -0.3974609375, "step": 212 }, { "epoch": 0.23561946902654868, "grad_norm": 16.50137710571289, "learning_rate": 4.475167704188218e-07, "logits/chosen": -1.28515625, "logits/rejected": -1.33203125, "logps/chosen": -261.0, "logps/rejected": -274.0, "loss": 0.4309, "rewards/accuracies": 0.765625, "rewards/chosen": 1.0625, "rewards/margins": 1.57421875, "rewards/rejected": -0.513671875, "step": 213 }, { "epoch": 0.23672566371681417, "grad_norm": 13.223847389221191, "learning_rate": 4.4696588373098973e-07, "logits/chosen": -1.203125, "logits/rejected": -1.26953125, "logps/chosen": -246.0, "logps/rejected": -262.5, "loss": 0.3152, "rewards/accuracies": 0.828125, "rewards/chosen": 1.40234375, "rewards/margins": 2.12109375, "rewards/rejected": -0.7177734375, "step": 214 }, { "epoch": 0.23783185840707965, "grad_norm": 15.553281784057617, "learning_rate": 4.4641246376967854e-07, "logits/chosen": -1.19140625, "logits/rejected": -1.1640625, "logps/chosen": -256.5, "logps/rejected": -271.0, "loss": 0.3849, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.203125, "rewards/margins": 1.73046875, "rewards/rejected": -0.52734375, "step": 215 }, { "epoch": 0.23893805309734514, "grad_norm": 14.652776718139648, "learning_rate": 4.4585651765268983e-07, "logits/chosen": -1.21484375, "logits/rejected": -1.16796875, "logps/chosen": -249.0, "logps/rejected": -240.0, "loss": 0.394, "rewards/accuracies": 0.7421875, "rewards/chosen": 1.15625, "rewards/margins": 1.73828125, "rewards/rejected": -0.5859375, "step": 216 }, { "epoch": 0.24004424778761063, "grad_norm": 15.165270805358887, "learning_rate": 4.452980525303155e-07, "logits/chosen": -1.29296875, "logits/rejected": -1.22265625, "logps/chosen": -272.5, "logps/rejected": -269.0, "loss": 0.3583, "rewards/accuracies": 0.8125, "rewards/chosen": 1.25, "rewards/margins": 1.81640625, "rewards/rejected": -0.56640625, "step": 217 }, { "epoch": 0.2411504424778761, "grad_norm": 13.010436058044434, "learning_rate": 4.4473707558524553e-07, "logits/chosen": -1.41796875, "logits/rejected": -1.1640625, "logps/chosen": -248.5, "logps/rejected": -276.0, "loss": 0.3244, "rewards/accuracies": 0.796875, "rewards/chosen": 1.30859375, "rewards/margins": 2.05859375, "rewards/rejected": -0.751953125, "step": 218 }, { "epoch": 0.24225663716814158, "grad_norm": 14.902968406677246, "learning_rate": 4.4417359403247567e-07, "logits/chosen": -1.2890625, "logits/rejected": -1.10546875, "logps/chosen": -255.0, "logps/rejected": -276.0, "loss": 0.3569, "rewards/accuracies": 0.75, "rewards/chosen": 1.453125, "rewards/margins": 2.0234375, "rewards/rejected": -0.568359375, "step": 219 }, { "epoch": 0.24336283185840707, "grad_norm": 13.878840446472168, "learning_rate": 4.436076151192146e-07, "logits/chosen": -1.33203125, "logits/rejected": -1.26953125, "logps/chosen": -218.0, "logps/rejected": -246.5, "loss": 0.3976, "rewards/accuracies": 0.703125, "rewards/chosen": 1.1484375, "rewards/margins": 1.8046875, "rewards/rejected": -0.654296875, "step": 220 }, { "epoch": 0.24446902654867256, "grad_norm": 13.981918334960938, "learning_rate": 4.4303914612479104e-07, "logits/chosen": -1.32421875, "logits/rejected": -1.25, "logps/chosen": -237.0, "logps/rejected": -273.0, "loss": 0.3427, "rewards/accuracies": 0.78125, "rewards/chosen": 1.36328125, "rewards/margins": 2.0390625, "rewards/rejected": -0.673828125, "step": 221 }, { "epoch": 0.24557522123893805, "grad_norm": 12.754227638244629, "learning_rate": 4.4246819436055946e-07, "logits/chosen": -1.234375, "logits/rejected": -1.140625, "logps/chosen": -248.0, "logps/rejected": -250.0, "loss": 0.3383, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.109375, "rewards/margins": 1.87890625, "rewards/rejected": -0.76171875, "step": 222 }, { "epoch": 0.24668141592920353, "grad_norm": 14.764009475708008, "learning_rate": 4.418947671698066e-07, "logits/chosen": -1.34375, "logits/rejected": -1.234375, "logps/chosen": -250.5, "logps/rejected": -266.0, "loss": 0.3845, "rewards/accuracies": 0.8125, "rewards/chosen": 1.060546875, "rewards/margins": 1.7265625, "rewards/rejected": -0.66015625, "step": 223 }, { "epoch": 0.24778761061946902, "grad_norm": 15.158235549926758, "learning_rate": 4.4131887192765684e-07, "logits/chosen": -1.28515625, "logits/rejected": -1.26953125, "logps/chosen": -244.0, "logps/rejected": -265.0, "loss": 0.3368, "rewards/accuracies": 0.8046875, "rewards/chosen": 1.19140625, "rewards/margins": 2.09375, "rewards/rejected": -0.904296875, "step": 224 }, { "epoch": 0.2488938053097345, "grad_norm": 13.44605827331543, "learning_rate": 4.4074051604097753e-07, "logits/chosen": -1.265625, "logits/rejected": -1.26171875, "logps/chosen": -248.0, "logps/rejected": -269.0, "loss": 0.3464, "rewards/accuracies": 0.796875, "rewards/chosen": 1.28515625, "rewards/margins": 2.03125, "rewards/rejected": -0.744140625, "step": 225 }, { "epoch": 0.25, "grad_norm": 15.778076171875, "learning_rate": 4.401597069482832e-07, "logits/chosen": -1.32421875, "logits/rejected": -1.26171875, "logps/chosen": -248.5, "logps/rejected": -265.0, "loss": 0.4139, "rewards/accuracies": 0.7265625, "rewards/chosen": 0.962890625, "rewards/margins": 1.66015625, "rewards/rejected": -0.697265625, "step": 226 }, { "epoch": 0.25110619469026546, "grad_norm": 13.870752334594727, "learning_rate": 4.395764521196406e-07, "logits/chosen": -1.26171875, "logits/rejected": -1.2265625, "logps/chosen": -234.5, "logps/rejected": -281.0, "loss": 0.3158, "rewards/accuracies": 0.796875, "rewards/chosen": 1.3984375, "rewards/margins": 2.2734375, "rewards/rejected": -0.87109375, "step": 227 }, { "epoch": 0.252212389380531, "grad_norm": 13.615601539611816, "learning_rate": 4.389907590565721e-07, "logits/chosen": -1.1796875, "logits/rejected": -1.15234375, "logps/chosen": -268.0, "logps/rejected": -290.0, "loss": 0.3724, "rewards/accuracies": 0.7421875, "rewards/chosen": 1.087890625, "rewards/margins": 1.91796875, "rewards/rejected": -0.830078125, "step": 228 }, { "epoch": 0.25331858407079644, "grad_norm": 14.186120986938477, "learning_rate": 4.3840263529195943e-07, "logits/chosen": -1.2421875, "logits/rejected": -1.2109375, "logps/chosen": -248.5, "logps/rejected": -262.0, "loss": 0.3415, "rewards/accuracies": 0.7734375, "rewards/chosen": 1.072265625, "rewards/margins": 1.9375, "rewards/rejected": -0.865234375, "step": 229 }, { "epoch": 0.25442477876106195, "grad_norm": 12.267884254455566, "learning_rate": 4.3781208838994663e-07, "logits/chosen": -1.296875, "logits/rejected": -1.23828125, "logps/chosen": -246.0, "logps/rejected": -257.5, "loss": 0.3271, "rewards/accuracies": 0.796875, "rewards/chosen": 1.15234375, "rewards/margins": 1.97265625, "rewards/rejected": -0.8203125, "step": 230 }, { "epoch": 0.2555309734513274, "grad_norm": 14.3861722946167, "learning_rate": 4.372191259458432e-07, "logits/chosen": -1.3515625, "logits/rejected": -1.21875, "logps/chosen": -234.5, "logps/rejected": -251.0, "loss": 0.3735, "rewards/accuracies": 0.765625, "rewards/chosen": 1.111328125, "rewards/margins": 1.92578125, "rewards/rejected": -0.81640625, "step": 231 }, { "epoch": 0.25663716814159293, "grad_norm": 13.046867370605469, "learning_rate": 4.366237555860256e-07, "logits/chosen": -1.35546875, "logits/rejected": -1.2109375, "logps/chosen": -246.0, "logps/rejected": -270.0, "loss": 0.3317, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.16015625, "rewards/margins": 2.109375, "rewards/rejected": -0.9453125, "step": 232 }, { "epoch": 0.2577433628318584, "grad_norm": 15.247108459472656, "learning_rate": 4.3602598496784013e-07, "logits/chosen": -1.2578125, "logits/rejected": -1.140625, "logps/chosen": -272.0, "logps/rejected": -268.0, "loss": 0.3798, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.9765625, "rewards/margins": 1.875, "rewards/rejected": -0.896484375, "step": 233 }, { "epoch": 0.2588495575221239, "grad_norm": 13.2136812210083, "learning_rate": 4.3542582177950373e-07, "logits/chosen": -1.21484375, "logits/rejected": -1.1484375, "logps/chosen": -227.5, "logps/rejected": -262.5, "loss": 0.3171, "rewards/accuracies": 0.828125, "rewards/chosen": 1.26953125, "rewards/margins": 2.1015625, "rewards/rejected": -0.822265625, "step": 234 }, { "epoch": 0.25995575221238937, "grad_norm": 13.574021339416504, "learning_rate": 4.348232737400054e-07, "logits/chosen": -1.13671875, "logits/rejected": -1.171875, "logps/chosen": -239.0, "logps/rejected": -267.0, "loss": 0.3749, "rewards/accuracies": 0.7734375, "rewards/chosen": 1.05859375, "rewards/margins": 1.78125, "rewards/rejected": -0.720703125, "step": 235 }, { "epoch": 0.2610619469026549, "grad_norm": 13.393758773803711, "learning_rate": 4.3421834859900685e-07, "logits/chosen": -1.265625, "logits/rejected": -1.140625, "logps/chosen": -236.5, "logps/rejected": -255.0, "loss": 0.3454, "rewards/accuracies": 0.78125, "rewards/chosen": 1.1796875, "rewards/margins": 2.140625, "rewards/rejected": -0.962890625, "step": 236 }, { "epoch": 0.26216814159292035, "grad_norm": 17.910938262939453, "learning_rate": 4.336110541367428e-07, "logits/chosen": -1.25390625, "logits/rejected": -1.19921875, "logps/chosen": -245.5, "logps/rejected": -272.0, "loss": 0.4424, "rewards/accuracies": 0.6875, "rewards/chosen": 0.943359375, "rewards/margins": 1.58984375, "rewards/rejected": -0.646484375, "step": 237 }, { "epoch": 0.26327433628318586, "grad_norm": 14.35909366607666, "learning_rate": 4.33001398163921e-07, "logits/chosen": -1.25, "logits/rejected": -1.2265625, "logps/chosen": -243.5, "logps/rejected": -260.5, "loss": 0.3525, "rewards/accuracies": 0.78125, "rewards/chosen": 1.30078125, "rewards/margins": 2.2109375, "rewards/rejected": -0.91015625, "step": 238 }, { "epoch": 0.2643805309734513, "grad_norm": 15.5848970413208, "learning_rate": 4.3238938852162187e-07, "logits/chosen": -1.1875, "logits/rejected": -1.3046875, "logps/chosen": -250.5, "logps/rejected": -273.0, "loss": 0.3839, "rewards/accuracies": 0.7265625, "rewards/chosen": 1.1640625, "rewards/margins": 2.03125, "rewards/rejected": -0.861328125, "step": 239 }, { "epoch": 0.26548672566371684, "grad_norm": 13.962175369262695, "learning_rate": 4.317750330811972e-07, "logits/chosen": -1.3671875, "logits/rejected": -1.328125, "logps/chosen": -250.5, "logps/rejected": -275.0, "loss": 0.3394, "rewards/accuracies": 0.8203125, "rewards/chosen": 1.1875, "rewards/margins": 1.91015625, "rewards/rejected": -0.72265625, "step": 240 }, { "epoch": 0.2665929203539823, "grad_norm": 13.130892753601074, "learning_rate": 4.311583397441696e-07, "logits/chosen": -1.18359375, "logits/rejected": -1.26171875, "logps/chosen": -240.0, "logps/rejected": -255.5, "loss": 0.3364, "rewards/accuracies": 0.765625, "rewards/chosen": 1.25, "rewards/margins": 2.203125, "rewards/rejected": -0.9453125, "step": 241 }, { "epoch": 0.2676991150442478, "grad_norm": 15.227952003479004, "learning_rate": 4.3053931644213e-07, "logits/chosen": -1.23046875, "logits/rejected": -1.1640625, "logps/chosen": -261.0, "logps/rejected": -269.5, "loss": 0.4343, "rewards/accuracies": 0.75, "rewards/chosen": 1.15234375, "rewards/margins": 1.65234375, "rewards/rejected": -0.4970703125, "step": 242 }, { "epoch": 0.2688053097345133, "grad_norm": 11.86292552947998, "learning_rate": 4.2991797113663676e-07, "logits/chosen": -1.24609375, "logits/rejected": -1.1875, "logps/chosen": -239.5, "logps/rejected": -268.0, "loss": 0.2865, "rewards/accuracies": 0.8515625, "rewards/chosen": 1.3125, "rewards/margins": 2.296875, "rewards/rejected": -0.98046875, "step": 243 }, { "epoch": 0.26991150442477874, "grad_norm": 12.915170669555664, "learning_rate": 4.292943118191121e-07, "logits/chosen": -1.19140625, "logits/rejected": -1.23828125, "logps/chosen": -243.0, "logps/rejected": -257.0, "loss": 0.3192, "rewards/accuracies": 0.8046875, "rewards/chosen": 1.26953125, "rewards/margins": 2.1328125, "rewards/rejected": -0.869140625, "step": 244 }, { "epoch": 0.27101769911504425, "grad_norm": 16.35938262939453, "learning_rate": 4.2866834651074024e-07, "logits/chosen": -1.16015625, "logits/rejected": -1.12890625, "logps/chosen": -283.0, "logps/rejected": -308.0, "loss": 0.3896, "rewards/accuracies": 0.734375, "rewards/chosen": 1.19921875, "rewards/margins": 1.8046875, "rewards/rejected": -0.607421875, "step": 245 }, { "epoch": 0.2721238938053097, "grad_norm": 14.654645919799805, "learning_rate": 4.280400832623636e-07, "logits/chosen": -1.25, "logits/rejected": -1.10546875, "logps/chosen": -269.5, "logps/rejected": -273.0, "loss": 0.3785, "rewards/accuracies": 0.7265625, "rewards/chosen": 1.234375, "rewards/margins": 1.95703125, "rewards/rejected": -0.71484375, "step": 246 }, { "epoch": 0.27323008849557523, "grad_norm": 12.577658653259277, "learning_rate": 4.274095301543796e-07, "logits/chosen": -1.4140625, "logits/rejected": -1.234375, "logps/chosen": -222.5, "logps/rejected": -252.0, "loss": 0.3402, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.33984375, "rewards/margins": 2.06640625, "rewards/rejected": -0.73046875, "step": 247 }, { "epoch": 0.2743362831858407, "grad_norm": 13.634322166442871, "learning_rate": 4.2677669529663686e-07, "logits/chosen": -1.2578125, "logits/rejected": -1.1640625, "logps/chosen": -266.0, "logps/rejected": -267.5, "loss": 0.3221, "rewards/accuracies": 0.796875, "rewards/chosen": 1.41015625, "rewards/margins": 2.2734375, "rewards/rejected": -0.87109375, "step": 248 }, { "epoch": 0.2754424778761062, "grad_norm": 14.120111465454102, "learning_rate": 4.2614158682833037e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.1328125, "logps/chosen": -251.0, "logps/rejected": -281.0, "loss": 0.3739, "rewards/accuracies": 0.765625, "rewards/chosen": 1.32421875, "rewards/margins": 1.90625, "rewards/rejected": -0.5791015625, "step": 249 }, { "epoch": 0.27654867256637167, "grad_norm": 14.189047813415527, "learning_rate": 4.255042129178973e-07, "logits/chosen": -1.27734375, "logits/rejected": -1.21484375, "logps/chosen": -237.0, "logps/rejected": -268.0, "loss": 0.3868, "rewards/accuracies": 0.7421875, "rewards/chosen": 1.14453125, "rewards/margins": 1.9375, "rewards/rejected": -0.794921875, "step": 250 }, { "epoch": 0.27654867256637167, "eval_logits/chosen": -1.2774020433425903, "eval_logits/rejected": -1.207769751548767, "eval_logps/chosen": -248.95523071289062, "eval_logps/rejected": -267.1990051269531, "eval_loss": 0.36062541604042053, "eval_rewards/accuracies": 0.7771241068840027, "eval_rewards/chosen": 1.1926889419555664, "eval_rewards/margins": 1.988767147064209, "eval_rewards/rejected": -0.7957963943481445, "eval_runtime": 193.0793, "eval_samples_per_second": 66.568, "eval_steps_per_second": 1.041, "step": 250 }, { "epoch": 0.2776548672566372, "grad_norm": 14.157464981079102, "learning_rate": 4.248645817629117e-07, "logits/chosen": -1.40625, "logits/rejected": -1.26171875, "logps/chosen": -262.0, "logps/rejected": -279.0, "loss": 0.3588, "rewards/accuracies": 0.796875, "rewards/chosen": 1.033203125, "rewards/margins": 1.91796875, "rewards/rejected": -0.88671875, "step": 251 }, { "epoch": 0.27876106194690264, "grad_norm": 12.822221755981445, "learning_rate": 4.242227015899793e-07, "logits/chosen": -1.2890625, "logits/rejected": -1.19140625, "logps/chosen": -245.5, "logps/rejected": -273.0, "loss": 0.3323, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.41796875, "rewards/margins": 2.2734375, "rewards/rejected": -0.853515625, "step": 252 }, { "epoch": 0.27986725663716816, "grad_norm": 15.107699394226074, "learning_rate": 4.2357858065463124e-07, "logits/chosen": -1.3046875, "logits/rejected": -1.13671875, "logps/chosen": -243.5, "logps/rejected": -275.0, "loss": 0.4063, "rewards/accuracies": 0.7421875, "rewards/chosen": 1.2421875, "rewards/margins": 1.88671875, "rewards/rejected": -0.642578125, "step": 253 }, { "epoch": 0.2809734513274336, "grad_norm": 14.644704818725586, "learning_rate": 4.229322272412185e-07, "logits/chosen": -1.2421875, "logits/rejected": -1.203125, "logps/chosen": -274.0, "logps/rejected": -289.0, "loss": 0.3511, "rewards/accuracies": 0.78125, "rewards/chosen": 1.009765625, "rewards/margins": 1.98046875, "rewards/rejected": -0.970703125, "step": 254 }, { "epoch": 0.28207964601769914, "grad_norm": 14.453044891357422, "learning_rate": 4.222836496628047e-07, "logits/chosen": -1.31640625, "logits/rejected": -1.21875, "logps/chosen": -264.0, "logps/rejected": -286.0, "loss": 0.3342, "rewards/accuracies": 0.828125, "rewards/chosen": 1.21484375, "rewards/margins": 1.9375, "rewards/rejected": -0.72265625, "step": 255 }, { "epoch": 0.2831858407079646, "grad_norm": 12.731569290161133, "learning_rate": 4.216328562610599e-07, "logits/chosen": -1.33203125, "logits/rejected": -1.2109375, "logps/chosen": -231.5, "logps/rejected": -262.5, "loss": 0.3542, "rewards/accuracies": 0.78125, "rewards/chosen": 1.06640625, "rewards/margins": 2.12890625, "rewards/rejected": -1.064453125, "step": 256 }, { "epoch": 0.2842920353982301, "grad_norm": 14.310720443725586, "learning_rate": 4.209798554061527e-07, "logits/chosen": -1.2578125, "logits/rejected": -1.21484375, "logps/chosen": -258.0, "logps/rejected": -282.0, "loss": 0.3884, "rewards/accuracies": 0.7578125, "rewards/chosen": 0.966796875, "rewards/margins": 1.828125, "rewards/rejected": -0.861328125, "step": 257 }, { "epoch": 0.2853982300884956, "grad_norm": 14.716500282287598, "learning_rate": 4.203246554966428e-07, "logits/chosen": -1.1796875, "logits/rejected": -1.3046875, "logps/chosen": -243.0, "logps/rejected": -253.0, "loss": 0.4139, "rewards/accuracies": 0.71875, "rewards/chosen": 0.962890625, "rewards/margins": 1.71484375, "rewards/rejected": -0.75, "step": 258 }, { "epoch": 0.28650442477876104, "grad_norm": 14.436864852905273, "learning_rate": 4.1966726495937305e-07, "logits/chosen": -1.42578125, "logits/rejected": -1.20703125, "logps/chosen": -252.5, "logps/rejected": -273.0, "loss": 0.3439, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.08984375, "rewards/margins": 1.95703125, "rewards/rejected": -0.86328125, "step": 259 }, { "epoch": 0.28761061946902655, "grad_norm": 15.182847023010254, "learning_rate": 4.1900769224936124e-07, "logits/chosen": -1.1953125, "logits/rejected": -1.19140625, "logps/chosen": -286.0, "logps/rejected": -310.0, "loss": 0.3774, "rewards/accuracies": 0.7109375, "rewards/chosen": 0.96875, "rewards/margins": 1.99609375, "rewards/rejected": -1.03125, "step": 260 }, { "epoch": 0.288716814159292, "grad_norm": 13.360356330871582, "learning_rate": 4.1834594584969077e-07, "logits/chosen": -1.25390625, "logits/rejected": -1.2109375, "logps/chosen": -248.5, "logps/rejected": -266.0, "loss": 0.3638, "rewards/accuracies": 0.8046875, "rewards/chosen": 1.115234375, "rewards/margins": 1.9140625, "rewards/rejected": -0.802734375, "step": 261 }, { "epoch": 0.28982300884955753, "grad_norm": 13.982027053833008, "learning_rate": 4.176820342714022e-07, "logits/chosen": -1.39453125, "logits/rejected": -1.26171875, "logps/chosen": -259.0, "logps/rejected": -281.0, "loss": 0.3449, "rewards/accuracies": 0.7578125, "rewards/chosen": 1.068359375, "rewards/margins": 1.99609375, "rewards/rejected": -0.921875, "step": 262 }, { "epoch": 0.290929203539823, "grad_norm": 13.159867286682129, "learning_rate": 4.1701596605338334e-07, "logits/chosen": -1.40234375, "logits/rejected": -1.234375, "logps/chosen": -242.5, "logps/rejected": -271.0, "loss": 0.3395, "rewards/accuracies": 0.8046875, "rewards/chosen": 1.25, "rewards/margins": 1.99609375, "rewards/rejected": -0.751953125, "step": 263 }, { "epoch": 0.2920353982300885, "grad_norm": 12.9893798828125, "learning_rate": 4.1634774976225965e-07, "logits/chosen": -1.35546875, "logits/rejected": -1.2109375, "logps/chosen": -234.5, "logps/rejected": -277.0, "loss": 0.3156, "rewards/accuracies": 0.8125, "rewards/chosen": 1.171875, "rewards/margins": 2.2421875, "rewards/rejected": -1.0703125, "step": 264 }, { "epoch": 0.29314159292035397, "grad_norm": 13.78116226196289, "learning_rate": 4.15677393992284e-07, "logits/chosen": -1.2890625, "logits/rejected": -1.2578125, "logps/chosen": -253.5, "logps/rejected": -279.0, "loss": 0.3418, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.20703125, "rewards/margins": 2.171875, "rewards/rejected": -0.966796875, "step": 265 }, { "epoch": 0.2942477876106195, "grad_norm": 14.935332298278809, "learning_rate": 4.150049073652261e-07, "logits/chosen": -1.31640625, "logits/rejected": -1.15625, "logps/chosen": -265.0, "logps/rejected": -291.0, "loss": 0.3503, "rewards/accuracies": 0.765625, "rewards/chosen": 1.21875, "rewards/margins": 2.1640625, "rewards/rejected": -0.94140625, "step": 266 }, { "epoch": 0.29535398230088494, "grad_norm": 15.937322616577148, "learning_rate": 4.1433029853026163e-07, "logits/chosen": -1.31640625, "logits/rejected": -1.203125, "logps/chosen": -245.5, "logps/rejected": -294.0, "loss": 0.3923, "rewards/accuracies": 0.796875, "rewards/chosen": 1.0625, "rewards/margins": 1.91796875, "rewards/rejected": -0.85546875, "step": 267 }, { "epoch": 0.29646017699115046, "grad_norm": 14.759867668151855, "learning_rate": 4.136535761638611e-07, "logits/chosen": -1.2890625, "logits/rejected": -1.09375, "logps/chosen": -276.0, "logps/rejected": -295.0, "loss": 0.356, "rewards/accuracies": 0.7578125, "rewards/chosen": 0.98828125, "rewards/margins": 1.98828125, "rewards/rejected": -1.001953125, "step": 268 }, { "epoch": 0.2975663716814159, "grad_norm": 13.723134994506836, "learning_rate": 4.129747489696781e-07, "logits/chosen": -1.23828125, "logits/rejected": -1.12890625, "logps/chosen": -252.5, "logps/rejected": -246.5, "loss": 0.3215, "rewards/accuracies": 0.78125, "rewards/chosen": 1.06640625, "rewards/margins": 2.3671875, "rewards/rejected": -1.29296875, "step": 269 }, { "epoch": 0.29867256637168144, "grad_norm": 12.263731002807617, "learning_rate": 4.122938256784374e-07, "logits/chosen": -1.3671875, "logits/rejected": -1.30859375, "logps/chosen": -216.5, "logps/rejected": -275.0, "loss": 0.3189, "rewards/accuracies": 0.8125, "rewards/chosen": 1.16796875, "rewards/margins": 2.265625, "rewards/rejected": -1.09375, "step": 270 }, { "epoch": 0.2997787610619469, "grad_norm": 15.063496589660645, "learning_rate": 4.116108150478228e-07, "logits/chosen": -1.1875, "logits/rejected": -1.125, "logps/chosen": -255.5, "logps/rejected": -256.5, "loss": 0.3799, "rewards/accuracies": 0.7421875, "rewards/chosen": 0.818359375, "rewards/margins": 1.7890625, "rewards/rejected": -0.974609375, "step": 271 }, { "epoch": 0.3008849557522124, "grad_norm": 15.24313735961914, "learning_rate": 4.109257258623643e-07, "logits/chosen": -1.2265625, "logits/rejected": -1.09765625, "logps/chosen": -238.5, "logps/rejected": -274.0, "loss": 0.3779, "rewards/accuracies": 0.7578125, "rewards/chosen": 1.15625, "rewards/margins": 2.1875, "rewards/rejected": -1.03125, "step": 272 }, { "epoch": 0.3019911504424779, "grad_norm": 14.098630905151367, "learning_rate": 4.1023856693332516e-07, "logits/chosen": -1.3203125, "logits/rejected": -1.17578125, "logps/chosen": -248.0, "logps/rejected": -272.0, "loss": 0.3197, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.982421875, "rewards/margins": 2.25, "rewards/rejected": -1.26953125, "step": 273 }, { "epoch": 0.3030973451327434, "grad_norm": 13.230525970458984, "learning_rate": 4.0954934709858857e-07, "logits/chosen": -1.234375, "logits/rejected": -1.16796875, "logps/chosen": -268.0, "logps/rejected": -287.0, "loss": 0.3215, "rewards/accuracies": 0.828125, "rewards/chosen": 1.15234375, "rewards/margins": 2.125, "rewards/rejected": -0.97265625, "step": 274 }, { "epoch": 0.30420353982300885, "grad_norm": 12.722634315490723, "learning_rate": 4.0885807522254433e-07, "logits/chosen": -1.375, "logits/rejected": -1.25390625, "logps/chosen": -256.5, "logps/rejected": -319.0, "loss": 0.3175, "rewards/accuracies": 0.8125, "rewards/chosen": 1.37890625, "rewards/margins": 2.3515625, "rewards/rejected": -0.96875, "step": 275 }, { "epoch": 0.3053097345132743, "grad_norm": 12.688482284545898, "learning_rate": 4.0816476019597423e-07, "logits/chosen": -1.41015625, "logits/rejected": -1.2890625, "logps/chosen": -235.5, "logps/rejected": -256.0, "loss": 0.3222, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.28515625, "rewards/margins": 2.20703125, "rewards/rejected": -0.92578125, "step": 276 }, { "epoch": 0.3064159292035398, "grad_norm": 14.044715881347656, "learning_rate": 4.0746941093593807e-07, "logits/chosen": -1.2578125, "logits/rejected": -1.27734375, "logps/chosen": -249.0, "logps/rejected": -295.0, "loss": 0.2954, "rewards/accuracies": 0.8515625, "rewards/chosen": 1.359375, "rewards/margins": 2.3671875, "rewards/rejected": -1.009765625, "step": 277 }, { "epoch": 0.3075221238938053, "grad_norm": 15.867609024047852, "learning_rate": 4.0677203638565893e-07, "logits/chosen": -1.36328125, "logits/rejected": -1.25390625, "logps/chosen": -260.0, "logps/rejected": -275.0, "loss": 0.3278, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.19921875, "rewards/margins": 2.296875, "rewards/rejected": -1.10546875, "step": 278 }, { "epoch": 0.3086283185840708, "grad_norm": 16.124387741088867, "learning_rate": 4.060726455144082e-07, "logits/chosen": -1.25390625, "logits/rejected": -1.16015625, "logps/chosen": -240.5, "logps/rejected": -281.0, "loss": 0.3936, "rewards/accuracies": 0.71875, "rewards/chosen": 1.025390625, "rewards/margins": 1.96484375, "rewards/rejected": -0.9375, "step": 279 }, { "epoch": 0.30973451327433627, "grad_norm": 14.164496421813965, "learning_rate": 4.0537124731739003e-07, "logits/chosen": -1.34765625, "logits/rejected": -1.1953125, "logps/chosen": -250.0, "logps/rejected": -270.0, "loss": 0.3594, "rewards/accuracies": 0.796875, "rewards/chosen": 1.005859375, "rewards/margins": 2.02734375, "rewards/rejected": -1.02734375, "step": 280 }, { "epoch": 0.3108407079646018, "grad_norm": 14.754056930541992, "learning_rate": 4.0466785081562583e-07, "logits/chosen": -1.234375, "logits/rejected": -1.2265625, "logps/chosen": -258.5, "logps/rejected": -247.0, "loss": 0.3625, "rewards/accuracies": 0.8125, "rewards/chosen": 1.095703125, "rewards/margins": 2.06640625, "rewards/rejected": -0.970703125, "step": 281 }, { "epoch": 0.31194690265486724, "grad_norm": 14.682291030883789, "learning_rate": 4.039624650558382e-07, "logits/chosen": -1.13671875, "logits/rejected": -1.21484375, "logps/chosen": -239.0, "logps/rejected": -265.5, "loss": 0.3439, "rewards/accuracies": 0.765625, "rewards/chosen": 1.28125, "rewards/margins": 2.2890625, "rewards/rejected": -1.009765625, "step": 282 }, { "epoch": 0.31305309734513276, "grad_norm": 13.215510368347168, "learning_rate": 4.032550991103344e-07, "logits/chosen": -1.3984375, "logits/rejected": -1.30859375, "logps/chosen": -218.5, "logps/rejected": -263.5, "loss": 0.3302, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.0078125, "rewards/margins": 2.03515625, "rewards/rejected": -1.029296875, "step": 283 }, { "epoch": 0.3141592920353982, "grad_norm": 14.175792694091797, "learning_rate": 4.0254576207689004e-07, "logits/chosen": -1.23046875, "logits/rejected": -1.21484375, "logps/chosen": -268.0, "logps/rejected": -307.0, "loss": 0.3466, "rewards/accuracies": 0.8046875, "rewards/chosen": 1.083984375, "rewards/margins": 2.0703125, "rewards/rejected": -0.986328125, "step": 284 }, { "epoch": 0.31526548672566373, "grad_norm": 15.70940113067627, "learning_rate": 4.0183446307863174e-07, "logits/chosen": -1.359375, "logits/rejected": -1.20703125, "logps/chosen": -249.0, "logps/rejected": -281.0, "loss": 0.3759, "rewards/accuracies": 0.765625, "rewards/chosen": 0.927734375, "rewards/margins": 1.94921875, "rewards/rejected": -1.015625, "step": 285 }, { "epoch": 0.3163716814159292, "grad_norm": 14.439340591430664, "learning_rate": 4.0112121126391967e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.26171875, "logps/chosen": -278.0, "logps/rejected": -298.0, "loss": 0.3487, "rewards/accuracies": 0.7734375, "rewards/chosen": 1.2421875, "rewards/margins": 2.3515625, "rewards/rejected": -1.109375, "step": 286 }, { "epoch": 0.3174778761061947, "grad_norm": 13.696681022644043, "learning_rate": 4.0040601580623054e-07, "logits/chosen": -1.3125, "logits/rejected": -1.3515625, "logps/chosen": -236.0, "logps/rejected": -246.0, "loss": 0.3344, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.181640625, "rewards/margins": 2.3515625, "rewards/rejected": -1.16796875, "step": 287 }, { "epoch": 0.3185840707964602, "grad_norm": 13.659770011901855, "learning_rate": 3.9968888590403904e-07, "logits/chosen": -1.28125, "logits/rejected": -1.3828125, "logps/chosen": -248.5, "logps/rejected": -280.0, "loss": 0.3278, "rewards/accuracies": 0.796875, "rewards/chosen": 1.33203125, "rewards/margins": 2.375, "rewards/rejected": -1.046875, "step": 288 }, { "epoch": 0.3196902654867257, "grad_norm": 12.041626930236816, "learning_rate": 3.9896983078069947e-07, "logits/chosen": -1.3203125, "logits/rejected": -1.16796875, "logps/chosen": -245.0, "logps/rejected": -273.5, "loss": 0.3141, "rewards/accuracies": 0.8203125, "rewards/chosen": 1.22265625, "rewards/margins": 2.296875, "rewards/rejected": -1.078125, "step": 289 }, { "epoch": 0.32079646017699115, "grad_norm": 14.61534595489502, "learning_rate": 3.9824885968432755e-07, "logits/chosen": -1.31640625, "logits/rejected": -1.1875, "logps/chosen": -241.5, "logps/rejected": -251.0, "loss": 0.3742, "rewards/accuracies": 0.7421875, "rewards/chosen": 1.05859375, "rewards/margins": 2.0859375, "rewards/rejected": -1.025390625, "step": 290 }, { "epoch": 0.3219026548672566, "grad_norm": 13.926555633544922, "learning_rate": 3.975259818876811e-07, "logits/chosen": -1.31640625, "logits/rejected": -1.2578125, "logps/chosen": -262.0, "logps/rejected": -259.0, "loss": 0.298, "rewards/accuracies": 0.84375, "rewards/chosen": 1.2265625, "rewards/margins": 2.328125, "rewards/rejected": -1.1015625, "step": 291 }, { "epoch": 0.3230088495575221, "grad_norm": 12.315802574157715, "learning_rate": 3.968012066880412e-07, "logits/chosen": -1.27734375, "logits/rejected": -1.2265625, "logps/chosen": -259.0, "logps/rejected": -267.0, "loss": 0.3022, "rewards/accuracies": 0.796875, "rewards/chosen": 1.2109375, "rewards/margins": 2.515625, "rewards/rejected": -1.30078125, "step": 292 }, { "epoch": 0.3241150442477876, "grad_norm": 12.437846183776855, "learning_rate": 3.960745434070921e-07, "logits/chosen": -1.19921875, "logits/rejected": -1.06640625, "logps/chosen": -256.5, "logps/rejected": -281.0, "loss": 0.3422, "rewards/accuracies": 0.796875, "rewards/chosen": 0.744140625, "rewards/margins": 1.89453125, "rewards/rejected": -1.1484375, "step": 293 }, { "epoch": 0.3252212389380531, "grad_norm": 13.978434562683105, "learning_rate": 3.9534600139080163e-07, "logits/chosen": -1.2109375, "logits/rejected": -1.140625, "logps/chosen": -237.0, "logps/rejected": -274.0, "loss": 0.366, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.888671875, "rewards/margins": 2.2109375, "rewards/rejected": -1.32421875, "step": 294 }, { "epoch": 0.32632743362831856, "grad_norm": 17.65538787841797, "learning_rate": 3.94615590009301e-07, "logits/chosen": -1.2578125, "logits/rejected": -1.25390625, "logps/chosen": -264.0, "logps/rejected": -285.0, "loss": 0.4392, "rewards/accuracies": 0.7109375, "rewards/chosen": 0.931640625, "rewards/margins": 1.875, "rewards/rejected": -0.947265625, "step": 295 }, { "epoch": 0.3274336283185841, "grad_norm": 12.776206016540527, "learning_rate": 3.9388331865676425e-07, "logits/chosen": -1.1796875, "logits/rejected": -1.234375, "logps/chosen": -246.5, "logps/rejected": -260.0, "loss": 0.2823, "rewards/accuracies": 0.875, "rewards/chosen": 1.0234375, "rewards/margins": 2.453125, "rewards/rejected": -1.42578125, "step": 296 }, { "epoch": 0.32853982300884954, "grad_norm": 15.579447746276855, "learning_rate": 3.931491967512872e-07, "logits/chosen": -1.3984375, "logits/rejected": -1.2734375, "logps/chosen": -252.5, "logps/rejected": -283.0, "loss": 0.3896, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.94140625, "rewards/margins": 1.859375, "rewards/rejected": -0.91796875, "step": 297 }, { "epoch": 0.32964601769911506, "grad_norm": 13.556859970092773, "learning_rate": 3.9241323373476686e-07, "logits/chosen": -1.16015625, "logits/rejected": -1.125, "logps/chosen": -258.0, "logps/rejected": -265.0, "loss": 0.3322, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.1015625, "rewards/margins": 2.296875, "rewards/rejected": -1.1953125, "step": 298 }, { "epoch": 0.3307522123893805, "grad_norm": 12.573343276977539, "learning_rate": 3.916754390727794e-07, "logits/chosen": -1.203125, "logits/rejected": -1.15234375, "logps/chosen": -251.0, "logps/rejected": -285.0, "loss": 0.2524, "rewards/accuracies": 0.875, "rewards/chosen": 1.375, "rewards/margins": 2.71875, "rewards/rejected": -1.34765625, "step": 299 }, { "epoch": 0.33185840707964603, "grad_norm": 13.832542419433594, "learning_rate": 3.9093582225445877e-07, "logits/chosen": -1.28125, "logits/rejected": -1.1953125, "logps/chosen": -263.0, "logps/rejected": -283.5, "loss": 0.3695, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.689453125, "rewards/margins": 1.85546875, "rewards/rejected": -1.1640625, "step": 300 }, { "epoch": 0.33185840707964603, "eval_logits/chosen": -1.2725435495376587, "eval_logits/rejected": -1.1930581331253052, "eval_logps/chosen": -251.15921020507812, "eval_logps/rejected": -271.43780517578125, "eval_loss": 0.35059425234794617, "eval_rewards/accuracies": 0.7869349718093872, "eval_rewards/chosen": 0.9655628204345703, "eval_rewards/margins": 2.1816697120666504, "eval_rewards/rejected": -1.2169232368469238, "eval_runtime": 193.0334, "eval_samples_per_second": 66.584, "eval_steps_per_second": 1.041, "step": 300 }, { "epoch": 0.3329646017699115, "grad_norm": 13.506171226501465, "learning_rate": 3.901943927923744e-07, "logits/chosen": -1.2578125, "logits/rejected": -1.15234375, "logps/chosen": -258.5, "logps/rejected": -279.0, "loss": 0.3567, "rewards/accuracies": 0.8125, "rewards/chosen": 0.908203125, "rewards/margins": 2.03125, "rewards/rejected": -1.119140625, "step": 301 }, { "epoch": 0.334070796460177, "grad_norm": 14.112154960632324, "learning_rate": 3.8945116022240937e-07, "logits/chosen": -1.18359375, "logits/rejected": -1.08203125, "logps/chosen": -268.0, "logps/rejected": -313.0, "loss": 0.3424, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.9765625, "rewards/margins": 2.2109375, "rewards/rejected": -1.23046875, "step": 302 }, { "epoch": 0.33517699115044247, "grad_norm": 13.437678337097168, "learning_rate": 3.8870613410363707e-07, "logits/chosen": -1.26171875, "logits/rejected": -1.18359375, "logps/chosen": -269.0, "logps/rejected": -273.5, "loss": 0.361, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.830078125, "rewards/margins": 1.97265625, "rewards/rejected": -1.14453125, "step": 303 }, { "epoch": 0.336283185840708, "grad_norm": 14.396577835083008, "learning_rate": 3.8795932401819863e-07, "logits/chosen": -1.27734375, "logits/rejected": -1.1171875, "logps/chosen": -272.0, "logps/rejected": -286.0, "loss": 0.3308, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.861328125, "rewards/margins": 2.1796875, "rewards/rejected": -1.31640625, "step": 304 }, { "epoch": 0.33738938053097345, "grad_norm": 15.585335731506348, "learning_rate": 3.872107395711798e-07, "logits/chosen": -1.30859375, "logits/rejected": -1.08984375, "logps/chosen": -280.0, "logps/rejected": -337.0, "loss": 0.369, "rewards/accuracies": 0.75, "rewards/chosen": 0.9921875, "rewards/margins": 1.98046875, "rewards/rejected": -0.984375, "step": 305 }, { "epoch": 0.33849557522123896, "grad_norm": 16.997081756591797, "learning_rate": 3.864603903904871e-07, "logits/chosen": -1.27734375, "logits/rejected": -1.171875, "logps/chosen": -283.0, "logps/rejected": -291.0, "loss": 0.3989, "rewards/accuracies": 0.734375, "rewards/chosen": 0.982421875, "rewards/margins": 2.2421875, "rewards/rejected": -1.2578125, "step": 306 }, { "epoch": 0.3396017699115044, "grad_norm": 16.18097496032715, "learning_rate": 3.857082861267242e-07, "logits/chosen": -1.265625, "logits/rejected": -1.23828125, "logps/chosen": -249.0, "logps/rejected": -266.0, "loss": 0.402, "rewards/accuracies": 0.765625, "rewards/chosen": 0.83984375, "rewards/margins": 1.9921875, "rewards/rejected": -1.15234375, "step": 307 }, { "epoch": 0.3407079646017699, "grad_norm": 13.513619422912598, "learning_rate": 3.849544364530677e-07, "logits/chosen": -1.32421875, "logits/rejected": -1.2421875, "logps/chosen": -264.5, "logps/rejected": -273.0, "loss": 0.2981, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.98046875, "rewards/margins": 2.4921875, "rewards/rejected": -1.515625, "step": 308 }, { "epoch": 0.3418141592920354, "grad_norm": 15.130499839782715, "learning_rate": 3.8419885106514295e-07, "logits/chosen": -1.23828125, "logits/rejected": -1.18359375, "logps/chosen": -271.5, "logps/rejected": -284.0, "loss": 0.3542, "rewards/accuracies": 0.796875, "rewards/chosen": 0.962890625, "rewards/margins": 2.1484375, "rewards/rejected": -1.1875, "step": 309 }, { "epoch": 0.34292035398230086, "grad_norm": 18.18540382385254, "learning_rate": 3.834415396808988e-07, "logits/chosen": -1.27734375, "logits/rejected": -1.1640625, "logps/chosen": -251.0, "logps/rejected": -289.0, "loss": 0.3976, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.78515625, "rewards/margins": 2.03515625, "rewards/rejected": -1.24609375, "step": 310 }, { "epoch": 0.3440265486725664, "grad_norm": 13.235279083251953, "learning_rate": 3.826825120404833e-07, "logits/chosen": -1.14453125, "logits/rejected": -1.18359375, "logps/chosen": -265.0, "logps/rejected": -271.0, "loss": 0.3018, "rewards/accuracies": 0.828125, "rewards/chosen": 0.990234375, "rewards/margins": 2.40625, "rewards/rejected": -1.4140625, "step": 311 }, { "epoch": 0.34513274336283184, "grad_norm": 14.219352722167969, "learning_rate": 3.81921777906118e-07, "logits/chosen": -1.34375, "logits/rejected": -1.16796875, "logps/chosen": -243.0, "logps/rejected": -265.0, "loss": 0.3433, "rewards/accuracies": 0.78125, "rewards/chosen": 1.10546875, "rewards/margins": 2.3203125, "rewards/rejected": -1.2109375, "step": 312 }, { "epoch": 0.34623893805309736, "grad_norm": 14.085458755493164, "learning_rate": 3.8115934706197244e-07, "logits/chosen": -1.3046875, "logits/rejected": -1.234375, "logps/chosen": -263.0, "logps/rejected": -260.0, "loss": 0.3526, "rewards/accuracies": 0.7421875, "rewards/chosen": 1.12890625, "rewards/margins": 2.17578125, "rewards/rejected": -1.046875, "step": 313 }, { "epoch": 0.3473451327433628, "grad_norm": 13.884740829467773, "learning_rate": 3.8039522931403847e-07, "logits/chosen": -1.4375, "logits/rejected": -1.22265625, "logps/chosen": -257.0, "logps/rejected": -274.5, "loss": 0.3197, "rewards/accuracies": 0.828125, "rewards/chosen": 1.1328125, "rewards/margins": 2.29296875, "rewards/rejected": -1.15625, "step": 314 }, { "epoch": 0.34845132743362833, "grad_norm": 15.969679832458496, "learning_rate": 3.7962943449000377e-07, "logits/chosen": -1.1640625, "logits/rejected": -1.1484375, "logps/chosen": -260.0, "logps/rejected": -283.0, "loss": 0.4191, "rewards/accuracies": 0.71875, "rewards/chosen": 0.90625, "rewards/margins": 1.7890625, "rewards/rejected": -0.880859375, "step": 315 }, { "epoch": 0.3495575221238938, "grad_norm": 14.207815170288086, "learning_rate": 3.7886197243912607e-07, "logits/chosen": -1.22265625, "logits/rejected": -1.1015625, "logps/chosen": -256.0, "logps/rejected": -279.0, "loss": 0.3409, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.19140625, "rewards/margins": 2.34375, "rewards/rejected": -1.15625, "step": 316 }, { "epoch": 0.3506637168141593, "grad_norm": 14.867213249206543, "learning_rate": 3.7809285303210593e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.09375, "logps/chosen": -248.5, "logps/rejected": -246.0, "loss": 0.3668, "rewards/accuracies": 0.7421875, "rewards/chosen": 1.06640625, "rewards/margins": 2.0, "rewards/rejected": -0.93359375, "step": 317 }, { "epoch": 0.35176991150442477, "grad_norm": 13.388772964477539, "learning_rate": 3.7732208616095986e-07, "logits/chosen": -1.12890625, "logits/rejected": -1.1484375, "logps/chosen": -249.0, "logps/rejected": -276.0, "loss": 0.3055, "rewards/accuracies": 0.8359375, "rewards/chosen": 1.44140625, "rewards/margins": 2.578125, "rewards/rejected": -1.13671875, "step": 318 }, { "epoch": 0.3528761061946903, "grad_norm": 13.385259628295898, "learning_rate": 3.7654968173889334e-07, "logits/chosen": -1.34375, "logits/rejected": -1.14453125, "logps/chosen": -240.5, "logps/rejected": -279.0, "loss": 0.3375, "rewards/accuracies": 0.78125, "rewards/chosen": 1.28125, "rewards/margins": 2.4765625, "rewards/rejected": -1.19140625, "step": 319 }, { "epoch": 0.35398230088495575, "grad_norm": 13.495279312133789, "learning_rate": 3.7577564970017336e-07, "logits/chosen": -1.3203125, "logits/rejected": -1.12890625, "logps/chosen": -237.5, "logps/rejected": -251.0, "loss": 0.3125, "rewards/accuracies": 0.8203125, "rewards/chosen": 1.16796875, "rewards/margins": 2.546875, "rewards/rejected": -1.37890625, "step": 320 }, { "epoch": 0.35508849557522126, "grad_norm": 14.151552200317383, "learning_rate": 3.75e-07, "logits/chosen": -1.28515625, "logits/rejected": -1.1171875, "logps/chosen": -251.0, "logps/rejected": -281.0, "loss": 0.3316, "rewards/accuracies": 0.765625, "rewards/chosen": 1.12109375, "rewards/margins": 2.3125, "rewards/rejected": -1.19140625, "step": 321 }, { "epoch": 0.3561946902654867, "grad_norm": 13.408705711364746, "learning_rate": 3.742227426143793e-07, "logits/chosen": -1.23046875, "logits/rejected": -1.12890625, "logps/chosen": -229.5, "logps/rejected": -235.0, "loss": 0.3559, "rewards/accuracies": 0.7578125, "rewards/chosen": 0.94140625, "rewards/margins": 2.0625, "rewards/rejected": -1.1171875, "step": 322 }, { "epoch": 0.3573008849557522, "grad_norm": 14.06219482421875, "learning_rate": 3.734438875399943e-07, "logits/chosen": -1.29296875, "logits/rejected": -1.22265625, "logps/chosen": -270.0, "logps/rejected": -296.0, "loss": 0.3082, "rewards/accuracies": 0.8203125, "rewards/chosen": 1.3046875, "rewards/margins": 2.5, "rewards/rejected": -1.19921875, "step": 323 }, { "epoch": 0.3584070796460177, "grad_norm": 13.949469566345215, "learning_rate": 3.726634447940768e-07, "logits/chosen": -1.3359375, "logits/rejected": -1.17578125, "logps/chosen": -277.0, "logps/rejected": -297.0, "loss": 0.3666, "rewards/accuracies": 0.78125, "rewards/chosen": 1.04296875, "rewards/margins": 1.87109375, "rewards/rejected": -0.83203125, "step": 324 }, { "epoch": 0.35951327433628316, "grad_norm": 14.00977897644043, "learning_rate": 3.7188142441427836e-07, "logits/chosen": -1.1484375, "logits/rejected": -1.1640625, "logps/chosen": -237.5, "logps/rejected": -263.0, "loss": 0.3086, "rewards/accuracies": 0.8359375, "rewards/chosen": 1.328125, "rewards/margins": 2.53125, "rewards/rejected": -1.203125, "step": 325 }, { "epoch": 0.3606194690265487, "grad_norm": 17.24125099182129, "learning_rate": 3.710978364585411e-07, "logits/chosen": -1.25, "logits/rejected": -1.23046875, "logps/chosen": -265.5, "logps/rejected": -274.0, "loss": 0.4063, "rewards/accuracies": 0.7421875, "rewards/chosen": 1.005859375, "rewards/margins": 2.12890625, "rewards/rejected": -1.126953125, "step": 326 }, { "epoch": 0.36172566371681414, "grad_norm": 13.335806846618652, "learning_rate": 3.7031269100496897e-07, "logits/chosen": -1.3203125, "logits/rejected": -1.1328125, "logps/chosen": -246.0, "logps/rejected": -255.5, "loss": 0.3012, "rewards/accuracies": 0.8203125, "rewards/chosen": 1.142578125, "rewards/margins": 2.578125, "rewards/rejected": -1.42578125, "step": 327 }, { "epoch": 0.36283185840707965, "grad_norm": 14.329955101013184, "learning_rate": 3.69525998151697e-07, "logits/chosen": -1.20703125, "logits/rejected": -1.140625, "logps/chosen": -260.0, "logps/rejected": -284.0, "loss": 0.3322, "rewards/accuracies": 0.75, "rewards/chosen": 1.23046875, "rewards/margins": 2.453125, "rewards/rejected": -1.22265625, "step": 328 }, { "epoch": 0.3639380530973451, "grad_norm": 14.133960723876953, "learning_rate": 3.687377680167626e-07, "logits/chosen": -1.23828125, "logits/rejected": -1.125, "logps/chosen": -253.0, "logps/rejected": -271.0, "loss": 0.3381, "rewards/accuracies": 0.8203125, "rewards/chosen": 1.23046875, "rewards/margins": 2.5390625, "rewards/rejected": -1.3125, "step": 329 }, { "epoch": 0.36504424778761063, "grad_norm": 15.067541122436523, "learning_rate": 3.6794801073797453e-07, "logits/chosen": -1.2265625, "logits/rejected": -1.15234375, "logps/chosen": -262.0, "logps/rejected": -284.0, "loss": 0.3784, "rewards/accuracies": 0.7734375, "rewards/chosen": 1.06640625, "rewards/margins": 2.1640625, "rewards/rejected": -1.09375, "step": 330 }, { "epoch": 0.3661504424778761, "grad_norm": 14.064321517944336, "learning_rate": 3.671567364727833e-07, "logits/chosen": -1.29296875, "logits/rejected": -1.140625, "logps/chosen": -234.5, "logps/rejected": -260.0, "loss": 0.3866, "rewards/accuracies": 0.8203125, "rewards/chosen": 1.0234375, "rewards/margins": 2.15625, "rewards/rejected": -1.12890625, "step": 331 }, { "epoch": 0.3672566371681416, "grad_norm": 14.258491516113281, "learning_rate": 3.663639553981497e-07, "logits/chosen": -1.2890625, "logits/rejected": -1.2421875, "logps/chosen": -236.0, "logps/rejected": -253.5, "loss": 0.3042, "rewards/accuracies": 0.84375, "rewards/chosen": 1.4140625, "rewards/margins": 2.6640625, "rewards/rejected": -1.25, "step": 332 }, { "epoch": 0.36836283185840707, "grad_norm": 14.841512680053711, "learning_rate": 3.655696777104146e-07, "logits/chosen": -1.25390625, "logits/rejected": -1.21875, "logps/chosen": -251.5, "logps/rejected": -276.0, "loss": 0.338, "rewards/accuracies": 0.8203125, "rewards/chosen": 1.28515625, "rewards/margins": 2.53125, "rewards/rejected": -1.24609375, "step": 333 }, { "epoch": 0.3694690265486726, "grad_norm": 13.853322982788086, "learning_rate": 3.647739136251673e-07, "logits/chosen": -1.1953125, "logits/rejected": -1.20703125, "logps/chosen": -265.0, "logps/rejected": -308.0, "loss": 0.3455, "rewards/accuracies": 0.765625, "rewards/chosen": 1.1796875, "rewards/margins": 2.234375, "rewards/rejected": -1.0546875, "step": 334 }, { "epoch": 0.37057522123893805, "grad_norm": 15.147529602050781, "learning_rate": 3.639766733771147e-07, "logits/chosen": -1.34375, "logits/rejected": -1.25, "logps/chosen": -250.5, "logps/rejected": -287.0, "loss": 0.3692, "rewards/accuracies": 0.78125, "rewards/chosen": 1.10546875, "rewards/margins": 2.26171875, "rewards/rejected": -1.15625, "step": 335 }, { "epoch": 0.37168141592920356, "grad_norm": 13.71894359588623, "learning_rate": 3.6317796721994903e-07, "logits/chosen": -1.30859375, "logits/rejected": -1.1953125, "logps/chosen": -272.0, "logps/rejected": -268.0, "loss": 0.3311, "rewards/accuracies": 0.8203125, "rewards/chosen": 1.083984375, "rewards/margins": 2.33984375, "rewards/rejected": -1.25390625, "step": 336 }, { "epoch": 0.372787610619469, "grad_norm": 12.683527946472168, "learning_rate": 3.623778054262164e-07, "logits/chosen": -1.30078125, "logits/rejected": -1.23046875, "logps/chosen": -268.0, "logps/rejected": -275.0, "loss": 0.302, "rewards/accuracies": 0.8125, "rewards/chosen": 1.26171875, "rewards/margins": 2.3515625, "rewards/rejected": -1.09375, "step": 337 }, { "epoch": 0.37389380530973454, "grad_norm": 12.87300968170166, "learning_rate": 3.6157619828718473e-07, "logits/chosen": -1.21875, "logits/rejected": -1.140625, "logps/chosen": -248.0, "logps/rejected": -249.5, "loss": 0.3173, "rewards/accuracies": 0.828125, "rewards/chosen": 0.939453125, "rewards/margins": 2.3046875, "rewards/rejected": -1.36328125, "step": 338 }, { "epoch": 0.375, "grad_norm": 13.38857650756836, "learning_rate": 3.6077315611271095e-07, "logits/chosen": -1.359375, "logits/rejected": -1.28125, "logps/chosen": -247.5, "logps/rejected": -256.0, "loss": 0.3028, "rewards/accuracies": 0.84375, "rewards/chosen": 1.03125, "rewards/margins": 2.3671875, "rewards/rejected": -1.33984375, "step": 339 }, { "epoch": 0.37610619469026546, "grad_norm": 13.413825988769531, "learning_rate": 3.5996868923110883e-07, "logits/chosen": -1.30859375, "logits/rejected": -1.1796875, "logps/chosen": -229.0, "logps/rejected": -270.5, "loss": 0.3433, "rewards/accuracies": 0.796875, "rewards/chosen": 1.015625, "rewards/margins": 2.2421875, "rewards/rejected": -1.2265625, "step": 340 }, { "epoch": 0.377212389380531, "grad_norm": 14.234886169433594, "learning_rate": 3.59162807989016e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.13671875, "logps/chosen": -248.5, "logps/rejected": -233.5, "loss": 0.3125, "rewards/accuracies": 0.78125, "rewards/chosen": 1.125, "rewards/margins": 2.6796875, "rewards/rejected": -1.55078125, "step": 341 }, { "epoch": 0.37831858407079644, "grad_norm": 14.096612930297852, "learning_rate": 3.583555227512607e-07, "logits/chosen": -1.3515625, "logits/rejected": -1.12109375, "logps/chosen": -238.5, "logps/rejected": -265.0, "loss": 0.356, "rewards/accuracies": 0.765625, "rewards/chosen": 1.1875, "rewards/margins": 2.65625, "rewards/rejected": -1.47265625, "step": 342 }, { "epoch": 0.37942477876106195, "grad_norm": 13.053836822509766, "learning_rate": 3.5754684390072886e-07, "logits/chosen": -1.16796875, "logits/rejected": -1.0625, "logps/chosen": -241.0, "logps/rejected": -280.0, "loss": 0.3579, "rewards/accuracies": 0.7578125, "rewards/chosen": 1.033203125, "rewards/margins": 2.10546875, "rewards/rejected": -1.07421875, "step": 343 }, { "epoch": 0.3805309734513274, "grad_norm": 13.854188919067383, "learning_rate": 3.5673678183823024e-07, "logits/chosen": -1.13671875, "logits/rejected": -1.0546875, "logps/chosen": -284.0, "logps/rejected": -302.0, "loss": 0.3128, "rewards/accuracies": 0.8359375, "rewards/chosen": 1.25390625, "rewards/margins": 2.5078125, "rewards/rejected": -1.2578125, "step": 344 }, { "epoch": 0.38163716814159293, "grad_norm": 15.331599235534668, "learning_rate": 3.559253469823647e-07, "logits/chosen": -1.09375, "logits/rejected": -1.048828125, "logps/chosen": -250.5, "logps/rejected": -278.0, "loss": 0.3974, "rewards/accuracies": 0.7109375, "rewards/chosen": 1.05078125, "rewards/margins": 2.3203125, "rewards/rejected": -1.2734375, "step": 345 }, { "epoch": 0.3827433628318584, "grad_norm": 13.891538619995117, "learning_rate": 3.5511254976938834e-07, "logits/chosen": -1.2890625, "logits/rejected": -1.1484375, "logps/chosen": -269.0, "logps/rejected": -274.0, "loss": 0.3552, "rewards/accuracies": 0.8203125, "rewards/chosen": 1.0703125, "rewards/margins": 2.125, "rewards/rejected": -1.05078125, "step": 346 }, { "epoch": 0.3838495575221239, "grad_norm": 14.387117385864258, "learning_rate": 3.542984006530792e-07, "logits/chosen": -1.36328125, "logits/rejected": -1.16015625, "logps/chosen": -237.0, "logps/rejected": -272.0, "loss": 0.3257, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.044921875, "rewards/margins": 2.546875, "rewards/rejected": -1.5078125, "step": 347 }, { "epoch": 0.38495575221238937, "grad_norm": 14.693575859069824, "learning_rate": 3.534829101046027e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.19921875, "logps/chosen": -253.0, "logps/rejected": -271.5, "loss": 0.4006, "rewards/accuracies": 0.7578125, "rewards/chosen": 0.859375, "rewards/margins": 1.90234375, "rewards/rejected": -1.046875, "step": 348 }, { "epoch": 0.3860619469026549, "grad_norm": 14.893670082092285, "learning_rate": 3.5266608861237723e-07, "logits/chosen": -1.30859375, "logits/rejected": -1.16015625, "logps/chosen": -257.0, "logps/rejected": -273.0, "loss": 0.3469, "rewards/accuracies": 0.78125, "rewards/chosen": 0.845703125, "rewards/margins": 2.171875, "rewards/rejected": -1.32421875, "step": 349 }, { "epoch": 0.38716814159292035, "grad_norm": 13.093351364135742, "learning_rate": 3.518479466819389e-07, "logits/chosen": -1.1484375, "logits/rejected": -1.20703125, "logps/chosen": -251.0, "logps/rejected": -290.0, "loss": 0.3118, "rewards/accuracies": 0.8359375, "rewards/chosen": 1.056640625, "rewards/margins": 2.5546875, "rewards/rejected": -1.5, "step": 350 }, { "epoch": 0.38716814159292035, "eval_logits/chosen": -1.260883092880249, "eval_logits/rejected": -1.1730799674987793, "eval_logps/chosen": -250.82586669921875, "eval_logps/rejected": -272.52239990234375, "eval_loss": 0.3436649739742279, "eval_rewards/accuracies": 0.7924543023109436, "eval_rewards/chosen": 1.0035176277160645, "eval_rewards/margins": 2.3350045680999756, "eval_rewards/rejected": -1.3310012817382812, "eval_runtime": 192.8803, "eval_samples_per_second": 66.637, "eval_steps_per_second": 1.042, "step": 350 }, { "epoch": 0.38827433628318586, "grad_norm": 17.082809448242188, "learning_rate": 3.510284948358068e-07, "logits/chosen": -1.2734375, "logits/rejected": -1.1328125, "logps/chosen": -264.0, "logps/rejected": -286.0, "loss": 0.4283, "rewards/accuracies": 0.703125, "rewards/chosen": 0.78125, "rewards/margins": 2.02734375, "rewards/rejected": -1.25, "step": 351 }, { "epoch": 0.3893805309734513, "grad_norm": 14.743417739868164, "learning_rate": 3.5020774361334744e-07, "logits/chosen": -1.27734375, "logits/rejected": -1.28125, "logps/chosen": -235.0, "logps/rejected": -291.0, "loss": 0.3538, "rewards/accuracies": 0.765625, "rewards/chosen": 1.1015625, "rewards/margins": 2.375, "rewards/rejected": -1.26953125, "step": 352 }, { "epoch": 0.39048672566371684, "grad_norm": 12.676244735717773, "learning_rate": 3.49385703570639e-07, "logits/chosen": -1.35546875, "logits/rejected": -1.19921875, "logps/chosen": -243.5, "logps/rejected": -254.0, "loss": 0.2715, "rewards/accuracies": 0.8515625, "rewards/chosen": 1.2578125, "rewards/margins": 2.7578125, "rewards/rejected": -1.5, "step": 353 }, { "epoch": 0.3915929203539823, "grad_norm": 13.607057571411133, "learning_rate": 3.485623852803361e-07, "logits/chosen": -1.1953125, "logits/rejected": -1.12890625, "logps/chosen": -248.0, "logps/rejected": -263.5, "loss": 0.3456, "rewards/accuracies": 0.7578125, "rewards/chosen": 0.962890625, "rewards/margins": 2.46875, "rewards/rejected": -1.5, "step": 354 }, { "epoch": 0.3926991150442478, "grad_norm": 12.650440216064453, "learning_rate": 3.4773779933153343e-07, "logits/chosen": -1.296875, "logits/rejected": -1.2578125, "logps/chosen": -222.5, "logps/rejected": -240.0, "loss": 0.3298, "rewards/accuracies": 0.8046875, "rewards/chosen": 1.037109375, "rewards/margins": 2.265625, "rewards/rejected": -1.2265625, "step": 355 }, { "epoch": 0.3938053097345133, "grad_norm": 13.600220680236816, "learning_rate": 3.4691195632962957e-07, "logits/chosen": -1.41015625, "logits/rejected": -1.1796875, "logps/chosen": -225.0, "logps/rejected": -251.0, "loss": 0.3439, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.978515625, "rewards/margins": 2.3515625, "rewards/rejected": -1.375, "step": 356 }, { "epoch": 0.39491150442477874, "grad_norm": 14.466880798339844, "learning_rate": 3.4608486689619083e-07, "logits/chosen": -1.1640625, "logits/rejected": -1.171875, "logps/chosen": -252.0, "logps/rejected": -254.0, "loss": 0.3437, "rewards/accuracies": 0.78125, "rewards/chosen": 0.904296875, "rewards/margins": 2.4375, "rewards/rejected": -1.53125, "step": 357 }, { "epoch": 0.39601769911504425, "grad_norm": 13.527942657470703, "learning_rate": 3.4525654166881426e-07, "logits/chosen": -1.34375, "logits/rejected": -1.19140625, "logps/chosen": -256.5, "logps/rejected": -281.0, "loss": 0.3267, "rewards/accuracies": 0.796875, "rewards/chosen": 0.962890625, "rewards/margins": 2.40625, "rewards/rejected": -1.44140625, "step": 358 }, { "epoch": 0.3971238938053097, "grad_norm": 13.268882751464844, "learning_rate": 3.4442699130099116e-07, "logits/chosen": -1.26171875, "logits/rejected": -1.1328125, "logps/chosen": -268.0, "logps/rejected": -297.0, "loss": 0.293, "rewards/accuracies": 0.828125, "rewards/chosen": 0.990234375, "rewards/margins": 2.6171875, "rewards/rejected": -1.6328125, "step": 359 }, { "epoch": 0.39823008849557523, "grad_norm": 14.783308982849121, "learning_rate": 3.435962264619702e-07, "logits/chosen": -1.1796875, "logits/rejected": -1.12109375, "logps/chosen": -239.5, "logps/rejected": -278.0, "loss": 0.3438, "rewards/accuracies": 0.765625, "rewards/chosen": 0.765625, "rewards/margins": 2.25390625, "rewards/rejected": -1.484375, "step": 360 }, { "epoch": 0.3993362831858407, "grad_norm": 14.32043743133545, "learning_rate": 3.427642578366194e-07, "logits/chosen": -1.2578125, "logits/rejected": -1.19140625, "logps/chosen": -256.5, "logps/rejected": -278.0, "loss": 0.3622, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.9375, "rewards/margins": 2.359375, "rewards/rejected": -1.421875, "step": 361 }, { "epoch": 0.4004424778761062, "grad_norm": 175.1194305419922, "learning_rate": 3.419310961252897e-07, "logits/chosen": -1.2109375, "logits/rejected": -1.10546875, "logps/chosen": -234.5, "logps/rejected": -354.0, "loss": 0.3211, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.06640625, "rewards/margins": 2.6015625, "rewards/rejected": -1.53515625, "step": 362 }, { "epoch": 0.40154867256637167, "grad_norm": 13.051432609558105, "learning_rate": 3.4109675204367686e-07, "logits/chosen": -1.1953125, "logits/rejected": -1.06640625, "logps/chosen": -269.0, "logps/rejected": -313.0, "loss": 0.3161, "rewards/accuracies": 0.8359375, "rewards/chosen": 1.140625, "rewards/margins": 2.5703125, "rewards/rejected": -1.4296875, "step": 363 }, { "epoch": 0.4026548672566372, "grad_norm": 12.166739463806152, "learning_rate": 3.4026123632268354e-07, "logits/chosen": -1.25390625, "logits/rejected": -1.09375, "logps/chosen": -235.0, "logps/rejected": -244.0, "loss": 0.3185, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.6875, "rewards/margins": 2.1953125, "rewards/rejected": -1.50390625, "step": 364 }, { "epoch": 0.40376106194690264, "grad_norm": 13.57703971862793, "learning_rate": 3.3942455970828146e-07, "logits/chosen": -1.25390625, "logits/rejected": -1.11328125, "logps/chosen": -249.5, "logps/rejected": -265.0, "loss": 0.3227, "rewards/accuracies": 0.8515625, "rewards/chosen": 1.013671875, "rewards/margins": 2.546875, "rewards/rejected": -1.52734375, "step": 365 }, { "epoch": 0.40486725663716816, "grad_norm": 12.446300506591797, "learning_rate": 3.38586732961373e-07, "logits/chosen": -1.3828125, "logits/rejected": -1.21875, "logps/chosen": -232.5, "logps/rejected": -237.5, "loss": 0.3575, "rewards/accuracies": 0.7578125, "rewards/chosen": 0.560546875, "rewards/margins": 2.234375, "rewards/rejected": -1.6796875, "step": 366 }, { "epoch": 0.4059734513274336, "grad_norm": 13.510902404785156, "learning_rate": 3.3774776685765327e-07, "logits/chosen": -1.3125, "logits/rejected": -1.265625, "logps/chosen": -242.5, "logps/rejected": -264.5, "loss": 0.3312, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.79296875, "rewards/margins": 2.3671875, "rewards/rejected": -1.57421875, "step": 367 }, { "epoch": 0.40707964601769914, "grad_norm": 12.729259490966797, "learning_rate": 3.3690767218747104e-07, "logits/chosen": -1.1015625, "logits/rejected": -1.1328125, "logps/chosen": -243.5, "logps/rejected": -272.0, "loss": 0.2871, "rewards/accuracies": 0.828125, "rewards/chosen": 1.12109375, "rewards/margins": 2.7734375, "rewards/rejected": -1.6484375, "step": 368 }, { "epoch": 0.4081858407079646, "grad_norm": 14.687115669250488, "learning_rate": 3.3606645975569e-07, "logits/chosen": -1.34765625, "logits/rejected": -1.078125, "logps/chosen": -248.5, "logps/rejected": -254.5, "loss": 0.3694, "rewards/accuracies": 0.8125, "rewards/chosen": 0.65625, "rewards/margins": 2.125, "rewards/rejected": -1.47265625, "step": 369 }, { "epoch": 0.4092920353982301, "grad_norm": 11.254127502441406, "learning_rate": 3.3522414038155016e-07, "logits/chosen": -1.22265625, "logits/rejected": -1.12109375, "logps/chosen": -225.0, "logps/rejected": -259.5, "loss": 0.2835, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.970703125, "rewards/margins": 2.65625, "rewards/rejected": -1.68359375, "step": 370 }, { "epoch": 0.4103982300884956, "grad_norm": 15.060591697692871, "learning_rate": 3.343807248985283e-07, "logits/chosen": -1.2890625, "logits/rejected": -1.16015625, "logps/chosen": -240.0, "logps/rejected": -265.5, "loss": 0.3759, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.689453125, "rewards/margins": 2.2109375, "rewards/rejected": -1.5234375, "step": 371 }, { "epoch": 0.41150442477876104, "grad_norm": 11.981417655944824, "learning_rate": 3.335362241541988e-07, "logits/chosen": -1.171875, "logits/rejected": -1.09375, "logps/chosen": -260.0, "logps/rejected": -280.0, "loss": 0.3012, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.732421875, "rewards/margins": 2.3828125, "rewards/rejected": -1.64453125, "step": 372 }, { "epoch": 0.41261061946902655, "grad_norm": 13.114727020263672, "learning_rate": 3.32690649010094e-07, "logits/chosen": -1.265625, "logits/rejected": -1.16796875, "logps/chosen": -244.5, "logps/rejected": -267.5, "loss": 0.2875, "rewards/accuracies": 0.8828125, "rewards/chosen": 0.927734375, "rewards/margins": 2.8125, "rewards/rejected": -1.8828125, "step": 373 }, { "epoch": 0.413716814159292, "grad_norm": 13.435688972473145, "learning_rate": 3.3184401034156484e-07, "logits/chosen": -1.16796875, "logits/rejected": -1.015625, "logps/chosen": -263.0, "logps/rejected": -271.0, "loss": 0.344, "rewards/accuracies": 0.796875, "rewards/chosen": 0.658203125, "rewards/margins": 2.2890625, "rewards/rejected": -1.625, "step": 374 }, { "epoch": 0.41482300884955753, "grad_norm": 16.25585174560547, "learning_rate": 3.3099631903764064e-07, "logits/chosen": -1.26171875, "logits/rejected": -1.125, "logps/chosen": -270.0, "logps/rejected": -291.0, "loss": 0.4301, "rewards/accuracies": 0.71875, "rewards/chosen": 0.572265625, "rewards/margins": 1.765625, "rewards/rejected": -1.1953125, "step": 375 }, { "epoch": 0.415929203539823, "grad_norm": 13.53650188446045, "learning_rate": 3.3014758600088923e-07, "logits/chosen": -1.2890625, "logits/rejected": -1.2265625, "logps/chosen": -232.0, "logps/rejected": -266.5, "loss": 0.3326, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.755859375, "rewards/margins": 2.234375, "rewards/rejected": -1.48046875, "step": 376 }, { "epoch": 0.4170353982300885, "grad_norm": 12.601165771484375, "learning_rate": 3.2929782214727653e-07, "logits/chosen": -1.29296875, "logits/rejected": -1.18359375, "logps/chosen": -246.5, "logps/rejected": -263.5, "loss": 0.3436, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.771484375, "rewards/margins": 2.515625, "rewards/rejected": -1.7421875, "step": 377 }, { "epoch": 0.41814159292035397, "grad_norm": 13.9395112991333, "learning_rate": 3.2844703840602636e-07, "logits/chosen": -1.30078125, "logits/rejected": -1.30859375, "logps/chosen": -243.5, "logps/rejected": -262.5, "loss": 0.3515, "rewards/accuracies": 0.765625, "rewards/chosen": 0.92578125, "rewards/margins": 2.390625, "rewards/rejected": -1.46484375, "step": 378 }, { "epoch": 0.4192477876106195, "grad_norm": 13.737992286682129, "learning_rate": 3.2759524571948e-07, "logits/chosen": -1.3125, "logits/rejected": -1.11328125, "logps/chosen": -258.0, "logps/rejected": -293.0, "loss": 0.2964, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.970703125, "rewards/margins": 2.5859375, "rewards/rejected": -1.61328125, "step": 379 }, { "epoch": 0.42035398230088494, "grad_norm": 14.736825942993164, "learning_rate": 3.26742455042955e-07, "logits/chosen": -1.3515625, "logits/rejected": -1.2109375, "logps/chosen": -247.5, "logps/rejected": -238.0, "loss": 0.3616, "rewards/accuracies": 0.78125, "rewards/chosen": 0.634765625, "rewards/margins": 2.1875, "rewards/rejected": -1.55078125, "step": 380 }, { "epoch": 0.42146017699115046, "grad_norm": 14.469903945922852, "learning_rate": 3.2588867734460464e-07, "logits/chosen": -1.1796875, "logits/rejected": -1.15234375, "logps/chosen": -252.0, "logps/rejected": -260.5, "loss": 0.355, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9296875, "rewards/margins": 2.5546875, "rewards/rejected": -1.62109375, "step": 381 }, { "epoch": 0.4225663716814159, "grad_norm": 14.506092071533203, "learning_rate": 3.250339236052767e-07, "logits/chosen": -1.140625, "logits/rejected": -1.18359375, "logps/chosen": -261.0, "logps/rejected": -289.0, "loss": 0.3673, "rewards/accuracies": 0.75, "rewards/chosen": 0.951171875, "rewards/margins": 2.3359375, "rewards/rejected": -1.37890625, "step": 382 }, { "epoch": 0.42367256637168144, "grad_norm": 13.806063652038574, "learning_rate": 3.2417820481837256e-07, "logits/chosen": -1.3671875, "logits/rejected": -1.16015625, "logps/chosen": -254.0, "logps/rejected": -274.0, "loss": 0.3272, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.712890625, "rewards/margins": 2.40625, "rewards/rejected": -1.69140625, "step": 383 }, { "epoch": 0.4247787610619469, "grad_norm": 13.990365982055664, "learning_rate": 3.2332153198970517e-07, "logits/chosen": -1.28125, "logits/rejected": -1.2109375, "logps/chosen": -250.5, "logps/rejected": -279.0, "loss": 0.334, "rewards/accuracies": 0.8125, "rewards/chosen": 0.66796875, "rewards/margins": 2.1640625, "rewards/rejected": -1.49609375, "step": 384 }, { "epoch": 0.4258849557522124, "grad_norm": 15.526028633117676, "learning_rate": 3.2246391613735815e-07, "logits/chosen": -1.390625, "logits/rejected": -1.1953125, "logps/chosen": -253.0, "logps/rejected": -258.5, "loss": 0.3283, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.92578125, "rewards/margins": 2.421875, "rewards/rejected": -1.49609375, "step": 385 }, { "epoch": 0.4269911504424779, "grad_norm": 13.068387985229492, "learning_rate": 3.2160536829154356e-07, "logits/chosen": -1.27734375, "logits/rejected": -1.23046875, "logps/chosen": -248.5, "logps/rejected": -286.0, "loss": 0.2813, "rewards/accuracies": 0.859375, "rewards/chosen": 1.46875, "rewards/margins": 3.0078125, "rewards/rejected": -1.54296875, "step": 386 }, { "epoch": 0.4280973451327434, "grad_norm": 14.58014965057373, "learning_rate": 3.207458994944606e-07, "logits/chosen": -1.2578125, "logits/rejected": -1.16796875, "logps/chosen": -261.0, "logps/rejected": -269.0, "loss": 0.3732, "rewards/accuracies": 0.7578125, "rewards/chosen": 0.6640625, "rewards/margins": 2.0546875, "rewards/rejected": -1.390625, "step": 387 }, { "epoch": 0.42920353982300885, "grad_norm": 13.451835632324219, "learning_rate": 3.1988552080015294e-07, "logits/chosen": -1.33984375, "logits/rejected": -1.15234375, "logps/chosen": -256.5, "logps/rejected": -264.0, "loss": 0.3112, "rewards/accuracies": 0.828125, "rewards/chosen": 1.11328125, "rewards/margins": 2.515625, "rewards/rejected": -1.40234375, "step": 388 }, { "epoch": 0.4303097345132743, "grad_norm": 15.350495338439941, "learning_rate": 3.1902424327436725e-07, "logits/chosen": -1.23828125, "logits/rejected": -1.0625, "logps/chosen": -273.0, "logps/rejected": -264.0, "loss": 0.3406, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.712890625, "rewards/margins": 2.1484375, "rewards/rejected": -1.44140625, "step": 389 }, { "epoch": 0.4314159292035398, "grad_norm": 17.061744689941406, "learning_rate": 3.1816207799440996e-07, "logits/chosen": -1.265625, "logits/rejected": -1.12890625, "logps/chosen": -278.5, "logps/rejected": -318.0, "loss": 0.3654, "rewards/accuracies": 0.7734375, "rewards/chosen": 1.16015625, "rewards/margins": 2.4375, "rewards/rejected": -1.28125, "step": 390 }, { "epoch": 0.4325221238938053, "grad_norm": 16.2224063873291, "learning_rate": 3.1729903604900595e-07, "logits/chosen": -1.25390625, "logits/rejected": -1.08203125, "logps/chosen": -244.0, "logps/rejected": -281.0, "loss": 0.3328, "rewards/accuracies": 0.796875, "rewards/chosen": 0.923828125, "rewards/margins": 2.5234375, "rewards/rejected": -1.59375, "step": 391 }, { "epoch": 0.4336283185840708, "grad_norm": 13.922826766967773, "learning_rate": 3.1643512853815487e-07, "logits/chosen": -1.26171875, "logits/rejected": -1.25390625, "logps/chosen": -249.0, "logps/rejected": -280.0, "loss": 0.3626, "rewards/accuracies": 0.796875, "rewards/chosen": 0.89453125, "rewards/margins": 2.12890625, "rewards/rejected": -1.2421875, "step": 392 }, { "epoch": 0.43473451327433627, "grad_norm": 14.233848571777344, "learning_rate": 3.15570366572989e-07, "logits/chosen": -1.3203125, "logits/rejected": -1.18359375, "logps/chosen": -246.5, "logps/rejected": -255.5, "loss": 0.33, "rewards/accuracies": 0.78125, "rewards/chosen": 0.875, "rewards/margins": 2.2421875, "rewards/rejected": -1.3671875, "step": 393 }, { "epoch": 0.4358407079646018, "grad_norm": 15.168607711791992, "learning_rate": 3.147047612756302e-07, "logits/chosen": -1.21875, "logits/rejected": -1.17578125, "logps/chosen": -281.0, "logps/rejected": -290.0, "loss": 0.345, "rewards/accuracies": 0.828125, "rewards/chosen": 0.892578125, "rewards/margins": 2.3125, "rewards/rejected": -1.421875, "step": 394 }, { "epoch": 0.43694690265486724, "grad_norm": 14.825115203857422, "learning_rate": 3.138383237790467e-07, "logits/chosen": -1.203125, "logits/rejected": -1.1484375, "logps/chosen": -250.5, "logps/rejected": -272.0, "loss": 0.3428, "rewards/accuracies": 0.796875, "rewards/chosen": 0.98828125, "rewards/margins": 2.4765625, "rewards/rejected": -1.484375, "step": 395 }, { "epoch": 0.43805309734513276, "grad_norm": 12.729918479919434, "learning_rate": 3.129710652269103e-07, "logits/chosen": -1.30859375, "logits/rejected": -1.1171875, "logps/chosen": -230.0, "logps/rejected": -255.5, "loss": 0.2864, "rewards/accuracies": 0.8515625, "rewards/chosen": 1.20703125, "rewards/margins": 2.875, "rewards/rejected": -1.66796875, "step": 396 }, { "epoch": 0.4391592920353982, "grad_norm": 12.986177444458008, "learning_rate": 3.1210299677345253e-07, "logits/chosen": -1.17578125, "logits/rejected": -1.140625, "logps/chosen": -257.0, "logps/rejected": -279.0, "loss": 0.3394, "rewards/accuracies": 0.8046875, "rewards/chosen": 1.013671875, "rewards/margins": 2.46875, "rewards/rejected": -1.45703125, "step": 397 }, { "epoch": 0.44026548672566373, "grad_norm": 15.79924488067627, "learning_rate": 3.1123412958332153e-07, "logits/chosen": -1.30859375, "logits/rejected": -1.15234375, "logps/chosen": -248.0, "logps/rejected": -275.0, "loss": 0.3804, "rewards/accuracies": 0.8125, "rewards/chosen": 0.84375, "rewards/margins": 2.34375, "rewards/rejected": -1.5, "step": 398 }, { "epoch": 0.4413716814159292, "grad_norm": 14.248608589172363, "learning_rate": 3.1036447483143834e-07, "logits/chosen": -1.2578125, "logits/rejected": -1.22265625, "logps/chosen": -261.5, "logps/rejected": -275.0, "loss": 0.3299, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.99609375, "rewards/margins": 2.6015625, "rewards/rejected": -1.609375, "step": 399 }, { "epoch": 0.4424778761061947, "grad_norm": 14.430908203125, "learning_rate": 3.094940437028535e-07, "logits/chosen": -1.1171875, "logits/rejected": -1.125, "logps/chosen": -250.5, "logps/rejected": -251.5, "loss": 0.3726, "rewards/accuracies": 0.765625, "rewards/chosen": 0.953125, "rewards/margins": 2.26953125, "rewards/rejected": -1.3203125, "step": 400 }, { "epoch": 0.4424778761061947, "eval_logits/chosen": -1.2572294473648071, "eval_logits/rejected": -1.1652674674987793, "eval_logps/chosen": -250.63681030273438, "eval_logps/rejected": -273.5472717285156, "eval_loss": 0.3369702994823456, "eval_rewards/accuracies": 0.7978180646896362, "eval_rewards/chosen": 1.024176001548767, "eval_rewards/margins": 2.449626922607422, "eval_rewards/rejected": -1.4251010417938232, "eval_runtime": 193.115, "eval_samples_per_second": 66.556, "eval_steps_per_second": 1.041, "step": 400 }, { "epoch": 0.4435840707964602, "grad_norm": 14.084267616271973, "learning_rate": 3.086228473926024e-07, "logits/chosen": -1.23828125, "logits/rejected": -1.26171875, "logps/chosen": -242.5, "logps/rejected": -257.0, "loss": 0.3172, "rewards/accuracies": 0.8359375, "rewards/chosen": 1.12109375, "rewards/margins": 2.5390625, "rewards/rejected": -1.4140625, "step": 401 }, { "epoch": 0.4446902654867257, "grad_norm": 13.272000312805176, "learning_rate": 3.077508971055623e-07, "logits/chosen": -1.1015625, "logits/rejected": -1.171875, "logps/chosen": -246.5, "logps/rejected": -295.0, "loss": 0.2771, "rewards/accuracies": 0.828125, "rewards/chosen": 1.2265625, "rewards/margins": 2.90625, "rewards/rejected": -1.6796875, "step": 402 }, { "epoch": 0.44579646017699115, "grad_norm": 13.017451286315918, "learning_rate": 3.0687820405630736e-07, "logits/chosen": -1.31640625, "logits/rejected": -1.2265625, "logps/chosen": -258.5, "logps/rejected": -286.0, "loss": 0.2997, "rewards/accuracies": 0.828125, "rewards/chosen": 1.3125, "rewards/margins": 2.8125, "rewards/rejected": -1.5, "step": 403 }, { "epoch": 0.4469026548672566, "grad_norm": 11.719470024108887, "learning_rate": 3.060047794689649e-07, "logits/chosen": -1.24609375, "logits/rejected": -1.171875, "logps/chosen": -246.0, "logps/rejected": -252.0, "loss": 0.273, "rewards/accuracies": 0.8203125, "rewards/chosen": 1.154296875, "rewards/margins": 2.7109375, "rewards/rejected": -1.55078125, "step": 404 }, { "epoch": 0.4480088495575221, "grad_norm": 12.74482250213623, "learning_rate": 3.0513063457707106e-07, "logits/chosen": -1.32421875, "logits/rejected": -1.17578125, "logps/chosen": -238.5, "logps/rejected": -227.0, "loss": 0.3567, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.908203125, "rewards/margins": 2.234375, "rewards/rejected": -1.32421875, "step": 405 }, { "epoch": 0.4491150442477876, "grad_norm": 14.130414962768555, "learning_rate": 3.0425578062342577e-07, "logits/chosen": -1.1484375, "logits/rejected": -1.18359375, "logps/chosen": -241.5, "logps/rejected": -268.0, "loss": 0.3743, "rewards/accuracies": 0.71875, "rewards/chosen": 1.017578125, "rewards/margins": 2.203125, "rewards/rejected": -1.18359375, "step": 406 }, { "epoch": 0.4502212389380531, "grad_norm": 15.725412368774414, "learning_rate": 3.03380228859949e-07, "logits/chosen": -1.29296875, "logits/rejected": -1.13671875, "logps/chosen": -271.5, "logps/rejected": -291.0, "loss": 0.3421, "rewards/accuracies": 0.765625, "rewards/chosen": 1.140625, "rewards/margins": 2.4140625, "rewards/rejected": -1.26953125, "step": 407 }, { "epoch": 0.45132743362831856, "grad_norm": 13.073963165283203, "learning_rate": 3.0250399054753526e-07, "logits/chosen": -1.19921875, "logits/rejected": -1.1328125, "logps/chosen": -271.0, "logps/rejected": -265.0, "loss": 0.3024, "rewards/accuracies": 0.8359375, "rewards/chosen": 1.078125, "rewards/margins": 2.515625, "rewards/rejected": -1.4375, "step": 408 }, { "epoch": 0.4524336283185841, "grad_norm": 12.850948333740234, "learning_rate": 3.016270769559093e-07, "logits/chosen": -1.203125, "logits/rejected": -1.06640625, "logps/chosen": -258.0, "logps/rejected": -275.0, "loss": 0.3189, "rewards/accuracies": 0.8046875, "rewards/chosen": 1.041015625, "rewards/margins": 2.234375, "rewards/rejected": -1.1953125, "step": 409 }, { "epoch": 0.45353982300884954, "grad_norm": 13.47533130645752, "learning_rate": 3.007494993634808e-07, "logits/chosen": -1.2578125, "logits/rejected": -1.16015625, "logps/chosen": -259.0, "logps/rejected": -269.0, "loss": 0.3222, "rewards/accuracies": 0.828125, "rewards/chosen": 1.125, "rewards/margins": 2.53125, "rewards/rejected": -1.41015625, "step": 410 }, { "epoch": 0.45464601769911506, "grad_norm": 13.043135643005371, "learning_rate": 2.9987126905719965e-07, "logits/chosen": -1.20703125, "logits/rejected": -1.1484375, "logps/chosen": -265.5, "logps/rejected": -272.5, "loss": 0.3374, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9140625, "rewards/margins": 2.3125, "rewards/rejected": -1.40234375, "step": 411 }, { "epoch": 0.4557522123893805, "grad_norm": 14.49729061126709, "learning_rate": 2.989923973324105e-07, "logits/chosen": -1.20703125, "logits/rejected": -1.08984375, "logps/chosen": -252.5, "logps/rejected": -281.0, "loss": 0.3668, "rewards/accuracies": 0.765625, "rewards/chosen": 1.09765625, "rewards/margins": 2.265625, "rewards/rejected": -1.1640625, "step": 412 }, { "epoch": 0.45685840707964603, "grad_norm": 15.509222030639648, "learning_rate": 2.9811289549270745e-07, "logits/chosen": -1.33203125, "logits/rejected": -1.2109375, "logps/chosen": -250.5, "logps/rejected": -286.0, "loss": 0.3665, "rewards/accuracies": 0.7421875, "rewards/chosen": 1.203125, "rewards/margins": 2.5, "rewards/rejected": -1.2890625, "step": 413 }, { "epoch": 0.4579646017699115, "grad_norm": 13.909936904907227, "learning_rate": 2.9723277484978917e-07, "logits/chosen": -1.1796875, "logits/rejected": -1.2109375, "logps/chosen": -270.0, "logps/rejected": -291.0, "loss": 0.2915, "rewards/accuracies": 0.828125, "rewards/chosen": 1.17578125, "rewards/margins": 2.765625, "rewards/rejected": -1.58984375, "step": 414 }, { "epoch": 0.459070796460177, "grad_norm": 13.325665473937988, "learning_rate": 2.963520467233127e-07, "logits/chosen": -1.45703125, "logits/rejected": -1.19140625, "logps/chosen": -252.0, "logps/rejected": -262.5, "loss": 0.3212, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.07421875, "rewards/margins": 2.4140625, "rewards/rejected": -1.34375, "step": 415 }, { "epoch": 0.46017699115044247, "grad_norm": 15.239510536193848, "learning_rate": 2.954707224407485e-07, "logits/chosen": -1.30078125, "logits/rejected": -1.24609375, "logps/chosen": -261.5, "logps/rejected": -285.0, "loss": 0.3534, "rewards/accuracies": 0.78125, "rewards/chosen": 1.0625, "rewards/margins": 2.3046875, "rewards/rejected": -1.2421875, "step": 416 }, { "epoch": 0.461283185840708, "grad_norm": 15.735715866088867, "learning_rate": 2.945888133372343e-07, "logits/chosen": -1.21484375, "logits/rejected": -1.12890625, "logps/chosen": -287.0, "logps/rejected": -288.0, "loss": 0.3967, "rewards/accuracies": 0.7265625, "rewards/chosen": 0.88671875, "rewards/margins": 2.125, "rewards/rejected": -1.23828125, "step": 417 }, { "epoch": 0.46238938053097345, "grad_norm": 15.247976303100586, "learning_rate": 2.937063307554295e-07, "logits/chosen": -1.32421875, "logits/rejected": -1.21484375, "logps/chosen": -226.0, "logps/rejected": -250.0, "loss": 0.3726, "rewards/accuracies": 0.8046875, "rewards/chosen": 1.12890625, "rewards/margins": 2.375, "rewards/rejected": -1.24609375, "step": 418 }, { "epoch": 0.46349557522123896, "grad_norm": 12.819127082824707, "learning_rate": 2.9282328604536937e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.2109375, "logps/chosen": -249.5, "logps/rejected": -271.0, "loss": 0.3065, "rewards/accuracies": 0.8359375, "rewards/chosen": 1.14453125, "rewards/margins": 2.578125, "rewards/rejected": -1.4296875, "step": 419 }, { "epoch": 0.4646017699115044, "grad_norm": 13.217004776000977, "learning_rate": 2.9193969056431907e-07, "logits/chosen": -1.19921875, "logits/rejected": -1.15234375, "logps/chosen": -254.5, "logps/rejected": -270.0, "loss": 0.3139, "rewards/accuracies": 0.828125, "rewards/chosen": 1.09765625, "rewards/margins": 2.78125, "rewards/rejected": -1.68359375, "step": 420 }, { "epoch": 0.4657079646017699, "grad_norm": 14.80079174041748, "learning_rate": 2.910555556766272e-07, "logits/chosen": -1.4375, "logits/rejected": -1.234375, "logps/chosen": -226.5, "logps/rejected": -263.0, "loss": 0.3987, "rewards/accuracies": 0.7265625, "rewards/chosen": 0.94140625, "rewards/margins": 2.12109375, "rewards/rejected": -1.17578125, "step": 421 }, { "epoch": 0.4668141592920354, "grad_norm": 13.704545974731445, "learning_rate": 2.9017089275358014e-07, "logits/chosen": -1.1953125, "logits/rejected": -1.1328125, "logps/chosen": -271.0, "logps/rejected": -287.0, "loss": 0.3016, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9609375, "rewards/margins": 2.546875, "rewards/rejected": -1.5859375, "step": 422 }, { "epoch": 0.46792035398230086, "grad_norm": 14.536904335021973, "learning_rate": 2.8928571317325564e-07, "logits/chosen": -1.25, "logits/rejected": -1.0703125, "logps/chosen": -279.0, "logps/rejected": -291.0, "loss": 0.3234, "rewards/accuracies": 0.8359375, "rewards/chosen": 1.19140625, "rewards/margins": 2.6171875, "rewards/rejected": -1.4296875, "step": 423 }, { "epoch": 0.4690265486725664, "grad_norm": 15.45283317565918, "learning_rate": 2.8840002832037625e-07, "logits/chosen": -1.28515625, "logits/rejected": -1.1875, "logps/chosen": -261.0, "logps/rejected": -283.0, "loss": 0.365, "rewards/accuracies": 0.78125, "rewards/chosen": 1.171875, "rewards/margins": 2.234375, "rewards/rejected": -1.0703125, "step": 424 }, { "epoch": 0.47013274336283184, "grad_norm": 14.761160850524902, "learning_rate": 2.8751384958616316e-07, "logits/chosen": -1.20703125, "logits/rejected": -1.1484375, "logps/chosen": -257.0, "logps/rejected": -285.0, "loss": 0.3295, "rewards/accuracies": 0.796875, "rewards/chosen": 1.2578125, "rewards/margins": 2.9296875, "rewards/rejected": -1.671875, "step": 425 }, { "epoch": 0.47123893805309736, "grad_norm": 14.022229194641113, "learning_rate": 2.8662718836818964e-07, "logits/chosen": -1.3359375, "logits/rejected": -1.1796875, "logps/chosen": -249.5, "logps/rejected": -275.0, "loss": 0.3165, "rewards/accuracies": 0.8125, "rewards/chosen": 0.984375, "rewards/margins": 2.53125, "rewards/rejected": -1.546875, "step": 426 }, { "epoch": 0.4723451327433628, "grad_norm": 13.972189903259277, "learning_rate": 2.8574005607023444e-07, "logits/chosen": -1.328125, "logits/rejected": -1.15234375, "logps/chosen": -253.0, "logps/rejected": -286.0, "loss": 0.3595, "rewards/accuracies": 0.765625, "rewards/chosen": 1.17578125, "rewards/margins": 2.3359375, "rewards/rejected": -1.16015625, "step": 427 }, { "epoch": 0.47345132743362833, "grad_norm": 13.316848754882812, "learning_rate": 2.848524641021349e-07, "logits/chosen": -1.2734375, "logits/rejected": -1.15625, "logps/chosen": -279.0, "logps/rejected": -304.0, "loss": 0.2876, "rewards/accuracies": 0.8828125, "rewards/chosen": 1.14453125, "rewards/margins": 2.734375, "rewards/rejected": -1.5859375, "step": 428 }, { "epoch": 0.4745575221238938, "grad_norm": 15.443883895874023, "learning_rate": 2.839644238796407e-07, "logits/chosen": -1.24609375, "logits/rejected": -1.14453125, "logps/chosen": -279.0, "logps/rejected": -291.0, "loss": 0.3446, "rewards/accuracies": 0.796875, "rewards/chosen": 1.017578125, "rewards/margins": 2.453125, "rewards/rejected": -1.4375, "step": 429 }, { "epoch": 0.4756637168141593, "grad_norm": 14.372305870056152, "learning_rate": 2.8307594682426637e-07, "logits/chosen": -1.2109375, "logits/rejected": -1.1484375, "logps/chosen": -260.5, "logps/rejected": -309.0, "loss": 0.2813, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0, "rewards/margins": 2.9609375, "rewards/rejected": -1.95703125, "step": 430 }, { "epoch": 0.47676991150442477, "grad_norm": 13.688916206359863, "learning_rate": 2.8218704436314524e-07, "logits/chosen": -1.46875, "logits/rejected": -1.22265625, "logps/chosen": -253.5, "logps/rejected": -276.0, "loss": 0.341, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.798828125, "rewards/margins": 2.171875, "rewards/rejected": -1.37109375, "step": 431 }, { "epoch": 0.4778761061946903, "grad_norm": 12.372815132141113, "learning_rate": 2.8129772792888145e-07, "logits/chosen": -1.29296875, "logits/rejected": -1.1171875, "logps/chosen": -235.0, "logps/rejected": -281.0, "loss": 0.2966, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.869140625, "rewards/margins": 2.6328125, "rewards/rejected": -1.76171875, "step": 432 }, { "epoch": 0.47898230088495575, "grad_norm": 15.202491760253906, "learning_rate": 2.804080089594039e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.109375, "logps/chosen": -260.0, "logps/rejected": -263.0, "loss": 0.3812, "rewards/accuracies": 0.765625, "rewards/chosen": 0.490234375, "rewards/margins": 1.98046875, "rewards/rejected": -1.484375, "step": 433 }, { "epoch": 0.48008849557522126, "grad_norm": 15.252046585083008, "learning_rate": 2.7951789889781845e-07, "logits/chosen": -1.25, "logits/rejected": -1.125, "logps/chosen": -261.0, "logps/rejected": -299.0, "loss": 0.3649, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.919921875, "rewards/margins": 2.328125, "rewards/rejected": -1.40625, "step": 434 }, { "epoch": 0.4811946902654867, "grad_norm": 11.937089920043945, "learning_rate": 2.786274091922611e-07, "logits/chosen": -1.296875, "logits/rejected": -1.1484375, "logps/chosen": -257.0, "logps/rejected": -279.0, "loss": 0.2799, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9375, "rewards/margins": 2.703125, "rewards/rejected": -1.76953125, "step": 435 }, { "epoch": 0.4823008849557522, "grad_norm": 12.86253833770752, "learning_rate": 2.7773655129575043e-07, "logits/chosen": -1.25, "logits/rejected": -1.11328125, "logps/chosen": -237.5, "logps/rejected": -266.5, "loss": 0.3076, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.802734375, "rewards/margins": 2.75, "rewards/rejected": -1.953125, "step": 436 }, { "epoch": 0.4834070796460177, "grad_norm": 12.546619415283203, "learning_rate": 2.7684533666604076e-07, "logits/chosen": -1.3828125, "logits/rejected": -1.05078125, "logps/chosen": -253.5, "logps/rejected": -257.0, "loss": 0.3184, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.689453125, "rewards/margins": 2.3984375, "rewards/rejected": -1.71484375, "step": 437 }, { "epoch": 0.48451327433628316, "grad_norm": 18.16977310180664, "learning_rate": 2.759537767654744e-07, "logits/chosen": -1.1796875, "logits/rejected": -1.1796875, "logps/chosen": -274.0, "logps/rejected": -294.0, "loss": 0.387, "rewards/accuracies": 0.734375, "rewards/chosen": 0.615234375, "rewards/margins": 2.3203125, "rewards/rejected": -1.703125, "step": 438 }, { "epoch": 0.4856194690265487, "grad_norm": 12.303600311279297, "learning_rate": 2.750618830608343e-07, "logits/chosen": -1.3203125, "logits/rejected": -1.06640625, "logps/chosen": -235.0, "logps/rejected": -242.5, "loss": 0.2887, "rewards/accuracies": 0.859375, "rewards/chosen": 0.74609375, "rewards/margins": 2.5546875, "rewards/rejected": -1.8125, "step": 439 }, { "epoch": 0.48672566371681414, "grad_norm": 13.665416717529297, "learning_rate": 2.7416966702319683e-07, "logits/chosen": -1.203125, "logits/rejected": -1.1171875, "logps/chosen": -283.5, "logps/rejected": -304.0, "loss": 0.2974, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.880859375, "rewards/margins": 2.6328125, "rewards/rejected": -1.75390625, "step": 440 }, { "epoch": 0.48783185840707965, "grad_norm": 14.621048927307129, "learning_rate": 2.732771401277838e-07, "logits/chosen": -1.21484375, "logits/rejected": -1.109375, "logps/chosen": -266.5, "logps/rejected": -264.5, "loss": 0.3651, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.4580078125, "rewards/margins": 2.140625, "rewards/rejected": -1.68359375, "step": 441 }, { "epoch": 0.4889380530973451, "grad_norm": 12.872169494628906, "learning_rate": 2.7238431385381523e-07, "logits/chosen": -1.25, "logits/rejected": -1.1484375, "logps/chosen": -245.5, "logps/rejected": -279.0, "loss": 0.3244, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.765625, "rewards/margins": 2.640625, "rewards/rejected": -1.875, "step": 442 }, { "epoch": 0.49004424778761063, "grad_norm": 13.446981430053711, "learning_rate": 2.714911996843616e-07, "logits/chosen": -1.23828125, "logits/rejected": -1.14453125, "logps/chosen": -263.0, "logps/rejected": -300.0, "loss": 0.3075, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.671875, "rewards/margins": 2.5703125, "rewards/rejected": -1.90234375, "step": 443 }, { "epoch": 0.4911504424778761, "grad_norm": 14.347339630126953, "learning_rate": 2.7059780910619617e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.08203125, "logps/chosen": -275.0, "logps/rejected": -310.0, "loss": 0.3042, "rewards/accuracies": 0.8203125, "rewards/chosen": 1.0625, "rewards/margins": 2.9140625, "rewards/rejected": -1.84765625, "step": 444 }, { "epoch": 0.4922566371681416, "grad_norm": 14.27387523651123, "learning_rate": 2.6970415360964716e-07, "logits/chosen": -1.27734375, "logits/rejected": -1.2109375, "logps/chosen": -237.0, "logps/rejected": -258.0, "loss": 0.3354, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.98046875, "rewards/margins": 2.7421875, "rewards/rejected": -1.76171875, "step": 445 }, { "epoch": 0.49336283185840707, "grad_norm": 13.943259239196777, "learning_rate": 2.6881024468845e-07, "logits/chosen": -1.15234375, "logits/rejected": -1.16796875, "logps/chosen": -247.5, "logps/rejected": -275.5, "loss": 0.3356, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.69140625, "rewards/margins": 2.734375, "rewards/rejected": -2.046875, "step": 446 }, { "epoch": 0.4944690265486726, "grad_norm": 15.396841049194336, "learning_rate": 2.679160938395997e-07, "logits/chosen": -1.28515625, "logits/rejected": -1.203125, "logps/chosen": -251.0, "logps/rejected": -283.0, "loss": 0.3342, "rewards/accuracies": 0.828125, "rewards/chosen": 0.9921875, "rewards/margins": 2.6796875, "rewards/rejected": -1.68359375, "step": 447 }, { "epoch": 0.49557522123893805, "grad_norm": 12.922630310058594, "learning_rate": 2.670217125632027e-07, "logits/chosen": -1.21875, "logits/rejected": -1.17578125, "logps/chosen": -248.5, "logps/rejected": -258.5, "loss": 0.3361, "rewards/accuracies": 0.78125, "rewards/chosen": 0.529296875, "rewards/margins": 2.1875, "rewards/rejected": -1.66015625, "step": 448 }, { "epoch": 0.49668141592920356, "grad_norm": 18.911989212036133, "learning_rate": 2.661271123623291e-07, "logits/chosen": -1.3125, "logits/rejected": -1.19921875, "logps/chosen": -288.0, "logps/rejected": -278.0, "loss": 0.4185, "rewards/accuracies": 0.765625, "rewards/chosen": 0.599609375, "rewards/margins": 2.078125, "rewards/rejected": -1.4765625, "step": 449 }, { "epoch": 0.497787610619469, "grad_norm": 16.120946884155273, "learning_rate": 2.652323047428646e-07, "logits/chosen": -1.21484375, "logits/rejected": -1.109375, "logps/chosen": -279.0, "logps/rejected": -303.0, "loss": 0.363, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.9609375, "rewards/margins": 2.609375, "rewards/rejected": -1.65625, "step": 450 }, { "epoch": 0.497787610619469, "eval_logits/chosen": -1.2554415464401245, "eval_logits/rejected": -1.1580379009246826, "eval_logps/chosen": -252.30845642089844, "eval_logps/rejected": -276.3631896972656, "eval_loss": 0.3314497768878937, "eval_rewards/accuracies": 0.8052030205726624, "eval_rewards/chosen": 0.85384601354599, "eval_rewards/margins": 2.567397356033325, "eval_rewards/rejected": -1.7136777639389038, "eval_runtime": 193.0141, "eval_samples_per_second": 66.591, "eval_steps_per_second": 1.041, "step": 450 }, { "epoch": 0.49889380530973454, "grad_norm": 13.481291770935059, "learning_rate": 2.6433730121336283e-07, "logits/chosen": -1.3671875, "logits/rejected": -1.234375, "logps/chosen": -241.5, "logps/rejected": -278.0, "loss": 0.3044, "rewards/accuracies": 0.8125, "rewards/chosen": 0.791015625, "rewards/margins": 2.7578125, "rewards/rejected": -1.9765625, "step": 451 }, { "epoch": 0.5, "grad_norm": 15.87637996673584, "learning_rate": 2.6344211328489696e-07, "logits/chosen": -1.2734375, "logits/rejected": -1.1953125, "logps/chosen": -269.0, "logps/rejected": -291.0, "loss": 0.3646, "rewards/accuracies": 0.765625, "rewards/chosen": 0.68359375, "rewards/margins": 2.28515625, "rewards/rejected": -1.6015625, "step": 452 }, { "epoch": 0.5011061946902655, "grad_norm": 12.020176887512207, "learning_rate": 2.625467524709118e-07, "logits/chosen": -1.2265625, "logits/rejected": -1.10546875, "logps/chosen": -255.0, "logps/rejected": -283.0, "loss": 0.2739, "rewards/accuracies": 0.84375, "rewards/chosen": 0.91796875, "rewards/margins": 2.8828125, "rewards/rejected": -1.96484375, "step": 453 }, { "epoch": 0.5022123893805309, "grad_norm": 13.10580825805664, "learning_rate": 2.616512302870757e-07, "logits/chosen": -1.23046875, "logits/rejected": -1.109375, "logps/chosen": -280.0, "logps/rejected": -286.0, "loss": 0.33, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.7685546875, "rewards/margins": 2.3203125, "rewards/rejected": -1.55078125, "step": 454 }, { "epoch": 0.5033185840707964, "grad_norm": 15.729186058044434, "learning_rate": 2.607555582511326e-07, "logits/chosen": -1.3828125, "logits/rejected": -1.140625, "logps/chosen": -289.0, "logps/rejected": -285.0, "loss": 0.3862, "rewards/accuracies": 0.734375, "rewards/chosen": 0.671875, "rewards/margins": 2.1796875, "rewards/rejected": -1.50390625, "step": 455 }, { "epoch": 0.504424778761062, "grad_norm": 13.986113548278809, "learning_rate": 2.5985974788275374e-07, "logits/chosen": -1.3828125, "logits/rejected": -1.203125, "logps/chosen": -230.5, "logps/rejected": -267.0, "loss": 0.3423, "rewards/accuracies": 0.765625, "rewards/chosen": 1.1640625, "rewards/margins": 2.8515625, "rewards/rejected": -1.6875, "step": 456 }, { "epoch": 0.5055309734513275, "grad_norm": 14.71418285369873, "learning_rate": 2.5896381070338933e-07, "logits/chosen": -1.2890625, "logits/rejected": -1.16796875, "logps/chosen": -274.0, "logps/rejected": -273.0, "loss": 0.3394, "rewards/accuracies": 0.765625, "rewards/chosen": 0.775390625, "rewards/margins": 2.19921875, "rewards/rejected": -1.421875, "step": 457 }, { "epoch": 0.5066371681415929, "grad_norm": 13.852214813232422, "learning_rate": 2.5806775823612076e-07, "logits/chosen": -1.34375, "logits/rejected": -1.203125, "logps/chosen": -244.0, "logps/rejected": -284.0, "loss": 0.3206, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.888671875, "rewards/margins": 2.5078125, "rewards/rejected": -1.62109375, "step": 458 }, { "epoch": 0.5077433628318584, "grad_norm": 13.243755340576172, "learning_rate": 2.5717160200551213e-07, "logits/chosen": -1.2421875, "logits/rejected": -1.0546875, "logps/chosen": -242.0, "logps/rejected": -268.0, "loss": 0.3353, "rewards/accuracies": 0.796875, "rewards/chosen": 0.626953125, "rewards/margins": 2.453125, "rewards/rejected": -1.828125, "step": 459 }, { "epoch": 0.5088495575221239, "grad_norm": 13.583708763122559, "learning_rate": 2.562753535374621e-07, "logits/chosen": -1.25, "logits/rejected": -1.1015625, "logps/chosen": -244.5, "logps/rejected": -266.5, "loss": 0.3068, "rewards/accuracies": 0.84375, "rewards/chosen": 0.912109375, "rewards/margins": 2.515625, "rewards/rejected": -1.6015625, "step": 460 }, { "epoch": 0.5099557522123894, "grad_norm": 14.627636909484863, "learning_rate": 2.553790243590556e-07, "logits/chosen": -1.2109375, "logits/rejected": -1.21875, "logps/chosen": -230.0, "logps/rejected": -265.0, "loss": 0.3564, "rewards/accuracies": 0.78125, "rewards/chosen": 0.998046875, "rewards/margins": 2.3984375, "rewards/rejected": -1.40234375, "step": 461 }, { "epoch": 0.5110619469026548, "grad_norm": 16.162841796875, "learning_rate": 2.5448262599841556e-07, "logits/chosen": -1.3984375, "logits/rejected": -1.23046875, "logps/chosen": -256.5, "logps/rejected": -280.0, "loss": 0.3297, "rewards/accuracies": 0.84375, "rewards/chosen": 0.83984375, "rewards/margins": 2.453125, "rewards/rejected": -1.6171875, "step": 462 }, { "epoch": 0.5121681415929203, "grad_norm": 13.07744026184082, "learning_rate": 2.535861699845549e-07, "logits/chosen": -1.203125, "logits/rejected": -1.1796875, "logps/chosen": -244.5, "logps/rejected": -279.0, "loss": 0.3268, "rewards/accuracies": 0.8125, "rewards/chosen": 0.818359375, "rewards/margins": 2.5, "rewards/rejected": -1.68359375, "step": 463 }, { "epoch": 0.5132743362831859, "grad_norm": 13.961365699768066, "learning_rate": 2.526896678472279e-07, "logits/chosen": -1.265625, "logits/rejected": -1.12109375, "logps/chosen": -267.0, "logps/rejected": -273.0, "loss": 0.3112, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.732421875, "rewards/margins": 2.671875, "rewards/rejected": -1.94140625, "step": 464 }, { "epoch": 0.5143805309734514, "grad_norm": 12.809479713439941, "learning_rate": 2.51793131116782e-07, "logits/chosen": -1.328125, "logits/rejected": -1.15625, "logps/chosen": -223.0, "logps/rejected": -244.5, "loss": 0.3162, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.744140625, "rewards/margins": 2.7109375, "rewards/rejected": -1.9765625, "step": 465 }, { "epoch": 0.5154867256637168, "grad_norm": 12.547861099243164, "learning_rate": 2.5089657132400964e-07, "logits/chosen": -1.14453125, "logits/rejected": -1.08984375, "logps/chosen": -261.0, "logps/rejected": -273.0, "loss": 0.2895, "rewards/accuracies": 0.84375, "rewards/chosen": 0.85546875, "rewards/margins": 2.6796875, "rewards/rejected": -1.828125, "step": 466 }, { "epoch": 0.5165929203539823, "grad_norm": 13.852119445800781, "learning_rate": 2.5e-07, "logits/chosen": -1.22265625, "logits/rejected": -1.18359375, "logps/chosen": -256.5, "logps/rejected": -294.0, "loss": 0.3104, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.8046875, "rewards/margins": 2.6875, "rewards/rejected": -1.8828125, "step": 467 }, { "epoch": 0.5176991150442478, "grad_norm": 12.272007942199707, "learning_rate": 2.491034286759903e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.14453125, "logps/chosen": -254.0, "logps/rejected": -284.0, "loss": 0.2965, "rewards/accuracies": 0.890625, "rewards/chosen": 0.87109375, "rewards/margins": 2.75, "rewards/rejected": -1.87109375, "step": 468 }, { "epoch": 0.5188053097345132, "grad_norm": 13.289767265319824, "learning_rate": 2.482068688832181e-07, "logits/chosen": -1.26171875, "logits/rejected": -1.13671875, "logps/chosen": -236.5, "logps/rejected": -258.0, "loss": 0.3045, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.90234375, "rewards/margins": 2.734375, "rewards/rejected": -1.82421875, "step": 469 }, { "epoch": 0.5199115044247787, "grad_norm": 13.52807903289795, "learning_rate": 2.4731033215277213e-07, "logits/chosen": -1.2578125, "logits/rejected": -1.17578125, "logps/chosen": -251.5, "logps/rejected": -285.0, "loss": 0.3189, "rewards/accuracies": 0.828125, "rewards/chosen": 0.828125, "rewards/margins": 2.640625, "rewards/rejected": -1.8203125, "step": 470 }, { "epoch": 0.5210176991150443, "grad_norm": 14.39229679107666, "learning_rate": 2.464138300154451e-07, "logits/chosen": -1.3515625, "logits/rejected": -1.1640625, "logps/chosen": -254.5, "logps/rejected": -280.0, "loss": 0.3246, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.787109375, "rewards/margins": 2.6015625, "rewards/rejected": -1.81640625, "step": 471 }, { "epoch": 0.5221238938053098, "grad_norm": 14.51116943359375, "learning_rate": 2.455173740015845e-07, "logits/chosen": -1.30859375, "logits/rejected": -1.12109375, "logps/chosen": -246.0, "logps/rejected": -269.5, "loss": 0.3957, "rewards/accuracies": 0.75, "rewards/chosen": 0.65234375, "rewards/margins": 2.35546875, "rewards/rejected": -1.70703125, "step": 472 }, { "epoch": 0.5232300884955752, "grad_norm": 14.468950271606445, "learning_rate": 2.4462097564094445e-07, "logits/chosen": -1.3125, "logits/rejected": -1.2421875, "logps/chosen": -250.0, "logps/rejected": -296.0, "loss": 0.3396, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.703125, "rewards/margins": 2.546875, "rewards/rejected": -1.83984375, "step": 473 }, { "epoch": 0.5243362831858407, "grad_norm": 12.03284740447998, "learning_rate": 2.4372464646253794e-07, "logits/chosen": -1.1953125, "logits/rejected": -1.09765625, "logps/chosen": -255.0, "logps/rejected": -277.0, "loss": 0.266, "rewards/accuracies": 0.859375, "rewards/chosen": 1.017578125, "rewards/margins": 2.96875, "rewards/rejected": -1.9453125, "step": 474 }, { "epoch": 0.5254424778761062, "grad_norm": 19.379676818847656, "learning_rate": 2.4282839799448785e-07, "logits/chosen": -1.22265625, "logits/rejected": -1.17578125, "logps/chosen": -277.0, "logps/rejected": -316.0, "loss": 0.3512, "rewards/accuracies": 0.796875, "rewards/chosen": 0.763671875, "rewards/margins": 2.7578125, "rewards/rejected": -1.99609375, "step": 475 }, { "epoch": 0.5265486725663717, "grad_norm": 13.170727729797363, "learning_rate": 2.419322417638792e-07, "logits/chosen": -1.27734375, "logits/rejected": -1.1171875, "logps/chosen": -255.5, "logps/rejected": -274.0, "loss": 0.3296, "rewards/accuracies": 0.796875, "rewards/chosen": 0.4990234375, "rewards/margins": 2.421875, "rewards/rejected": -1.9296875, "step": 476 }, { "epoch": 0.5276548672566371, "grad_norm": 10.943991661071777, "learning_rate": 2.410361892966107e-07, "logits/chosen": -1.3046875, "logits/rejected": -1.11328125, "logps/chosen": -223.5, "logps/rejected": -244.0, "loss": 0.2629, "rewards/accuracies": 0.84375, "rewards/chosen": 0.806640625, "rewards/margins": 3.1171875, "rewards/rejected": -2.3125, "step": 477 }, { "epoch": 0.5287610619469026, "grad_norm": 14.212424278259277, "learning_rate": 2.401402521172463e-07, "logits/chosen": -1.20703125, "logits/rejected": -1.2109375, "logps/chosen": -249.5, "logps/rejected": -274.0, "loss": 0.3509, "rewards/accuracies": 0.84375, "rewards/chosen": 0.65625, "rewards/margins": 2.4140625, "rewards/rejected": -1.75, "step": 478 }, { "epoch": 0.5298672566371682, "grad_norm": 11.761503219604492, "learning_rate": 2.392444417488673e-07, "logits/chosen": -1.41796875, "logits/rejected": -1.234375, "logps/chosen": -234.5, "logps/rejected": -278.0, "loss": 0.2504, "rewards/accuracies": 0.8671875, "rewards/chosen": 1.00390625, "rewards/margins": 2.953125, "rewards/rejected": -1.953125, "step": 479 }, { "epoch": 0.5309734513274337, "grad_norm": 15.108154296875, "learning_rate": 2.3834876971292433e-07, "logits/chosen": -1.2421875, "logits/rejected": -1.2265625, "logps/chosen": -285.0, "logps/rejected": -303.0, "loss": 0.3124, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.97265625, "rewards/margins": 3.0546875, "rewards/rejected": -2.0859375, "step": 480 }, { "epoch": 0.5320796460176991, "grad_norm": 13.407039642333984, "learning_rate": 2.3745324752908822e-07, "logits/chosen": -1.20703125, "logits/rejected": -1.1796875, "logps/chosen": -253.0, "logps/rejected": -283.0, "loss": 0.2827, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.830078125, "rewards/margins": 2.71875, "rewards/rejected": -1.890625, "step": 481 }, { "epoch": 0.5331858407079646, "grad_norm": 13.764703750610352, "learning_rate": 2.365578867151031e-07, "logits/chosen": -1.27734375, "logits/rejected": -1.1640625, "logps/chosen": -249.0, "logps/rejected": -260.0, "loss": 0.3393, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.607421875, "rewards/margins": 2.8359375, "rewards/rejected": -2.2265625, "step": 482 }, { "epoch": 0.5342920353982301, "grad_norm": 14.601144790649414, "learning_rate": 2.3566269878663714e-07, "logits/chosen": -1.30078125, "logits/rejected": -1.2109375, "logps/chosen": -264.5, "logps/rejected": -291.0, "loss": 0.3486, "rewards/accuracies": 0.8125, "rewards/chosen": 0.587890625, "rewards/margins": 2.3046875, "rewards/rejected": -1.71484375, "step": 483 }, { "epoch": 0.5353982300884956, "grad_norm": 13.12956714630127, "learning_rate": 2.347676952571354e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.16015625, "logps/chosen": -218.0, "logps/rejected": -243.0, "loss": 0.3522, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.751953125, "rewards/margins": 2.7265625, "rewards/rejected": -1.9765625, "step": 484 }, { "epoch": 0.536504424778761, "grad_norm": 13.433536529541016, "learning_rate": 2.3387288763767095e-07, "logits/chosen": -1.22265625, "logits/rejected": -1.0703125, "logps/chosen": -266.0, "logps/rejected": -267.0, "loss": 0.3058, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.6484375, "rewards/margins": 2.7734375, "rewards/rejected": -2.125, "step": 485 }, { "epoch": 0.5376106194690266, "grad_norm": 13.798343658447266, "learning_rate": 2.329782874367973e-07, "logits/chosen": -1.25, "logits/rejected": -1.125, "logps/chosen": -250.5, "logps/rejected": -260.0, "loss": 0.2997, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.568359375, "rewards/margins": 2.703125, "rewards/rejected": -2.125, "step": 486 }, { "epoch": 0.5387168141592921, "grad_norm": 13.606473922729492, "learning_rate": 2.3208390616040025e-07, "logits/chosen": -1.234375, "logits/rejected": -1.20703125, "logps/chosen": -265.5, "logps/rejected": -323.0, "loss": 0.3473, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7421875, "rewards/margins": 2.4609375, "rewards/rejected": -1.71875, "step": 487 }, { "epoch": 0.5398230088495575, "grad_norm": 14.441394805908203, "learning_rate": 2.3118975531155003e-07, "logits/chosen": -1.22265625, "logits/rejected": -1.14453125, "logps/chosen": -257.5, "logps/rejected": -281.0, "loss": 0.3566, "rewards/accuracies": 0.8125, "rewards/chosen": 0.501953125, "rewards/margins": 2.21875, "rewards/rejected": -1.71875, "step": 488 }, { "epoch": 0.540929203539823, "grad_norm": 13.99435043334961, "learning_rate": 2.3029584639035284e-07, "logits/chosen": -1.24609375, "logits/rejected": -1.16796875, "logps/chosen": -251.0, "logps/rejected": -293.0, "loss": 0.3419, "rewards/accuracies": 0.765625, "rewards/chosen": 0.701171875, "rewards/margins": 2.5234375, "rewards/rejected": -1.828125, "step": 489 }, { "epoch": 0.5420353982300885, "grad_norm": 12.558284759521484, "learning_rate": 2.294021908938039e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.05859375, "logps/chosen": -243.0, "logps/rejected": -249.5, "loss": 0.2931, "rewards/accuracies": 0.859375, "rewards/chosen": 0.7470703125, "rewards/margins": 2.9140625, "rewards/rejected": -2.1640625, "step": 490 }, { "epoch": 0.543141592920354, "grad_norm": 13.195667266845703, "learning_rate": 2.285088003156384e-07, "logits/chosen": -1.20703125, "logits/rejected": -1.0625, "logps/chosen": -268.0, "logps/rejected": -308.0, "loss": 0.33, "rewards/accuracies": 0.84375, "rewards/chosen": 0.669921875, "rewards/margins": 2.5234375, "rewards/rejected": -1.8515625, "step": 491 }, { "epoch": 0.5442477876106194, "grad_norm": 13.966536521911621, "learning_rate": 2.2761568614618472e-07, "logits/chosen": -1.33203125, "logits/rejected": -1.3203125, "logps/chosen": -250.5, "logps/rejected": -266.0, "loss": 0.3732, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.763671875, "rewards/margins": 2.3203125, "rewards/rejected": -1.5546875, "step": 492 }, { "epoch": 0.5453539823008849, "grad_norm": 13.840253829956055, "learning_rate": 2.2672285987221625e-07, "logits/chosen": -1.3359375, "logits/rejected": -1.16015625, "logps/chosen": -263.0, "logps/rejected": -279.5, "loss": 0.3326, "rewards/accuracies": 0.8125, "rewards/chosen": 0.62109375, "rewards/margins": 2.4140625, "rewards/rejected": -1.7890625, "step": 493 }, { "epoch": 0.5464601769911505, "grad_norm": 13.471453666687012, "learning_rate": 2.2583033297680315e-07, "logits/chosen": -1.203125, "logits/rejected": -1.06640625, "logps/chosen": -274.0, "logps/rejected": -307.0, "loss": 0.3214, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6025390625, "rewards/margins": 2.6953125, "rewards/rejected": -2.08984375, "step": 494 }, { "epoch": 0.547566371681416, "grad_norm": 13.170902252197266, "learning_rate": 2.2493811693916567e-07, "logits/chosen": -1.3984375, "logits/rejected": -1.13671875, "logps/chosen": -255.5, "logps/rejected": -285.0, "loss": 0.2704, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.822265625, "rewards/margins": 2.7421875, "rewards/rejected": -1.91796875, "step": 495 }, { "epoch": 0.5486725663716814, "grad_norm": 14.177343368530273, "learning_rate": 2.2404622323452562e-07, "logits/chosen": -1.30078125, "logits/rejected": -1.171875, "logps/chosen": -238.0, "logps/rejected": -284.0, "loss": 0.3352, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.916015625, "rewards/margins": 2.515625, "rewards/rejected": -1.59765625, "step": 496 }, { "epoch": 0.5497787610619469, "grad_norm": 10.760467529296875, "learning_rate": 2.2315466333395924e-07, "logits/chosen": -1.21484375, "logits/rejected": -1.19921875, "logps/chosen": -222.5, "logps/rejected": -285.0, "loss": 0.2274, "rewards/accuracies": 0.8828125, "rewards/chosen": 1.1875, "rewards/margins": 3.3515625, "rewards/rejected": -2.1640625, "step": 497 }, { "epoch": 0.5508849557522124, "grad_norm": 15.023584365844727, "learning_rate": 2.222634487042496e-07, "logits/chosen": -1.296875, "logits/rejected": -1.2109375, "logps/chosen": -256.5, "logps/rejected": -279.5, "loss": 0.3246, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.76953125, "rewards/margins": 2.515625, "rewards/rejected": -1.75, "step": 498 }, { "epoch": 0.5519911504424779, "grad_norm": 12.802349090576172, "learning_rate": 2.2137259080773896e-07, "logits/chosen": -1.28515625, "logits/rejected": -1.21875, "logps/chosen": -246.5, "logps/rejected": -256.0, "loss": 0.2918, "rewards/accuracies": 0.8828125, "rewards/chosen": 0.83203125, "rewards/margins": 2.8203125, "rewards/rejected": -1.98828125, "step": 499 }, { "epoch": 0.5530973451327433, "grad_norm": 13.720869064331055, "learning_rate": 2.204821011021815e-07, "logits/chosen": -1.19921875, "logits/rejected": -1.09765625, "logps/chosen": -242.0, "logps/rejected": -278.5, "loss": 0.3394, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.6025390625, "rewards/margins": 2.28125, "rewards/rejected": -1.6796875, "step": 500 }, { "epoch": 0.5530973451327433, "eval_logits/chosen": -1.2517879009246826, "eval_logits/rejected": -1.1513915061950684, "eval_logps/chosen": -252.49253845214844, "eval_logps/rejected": -277.5074768066406, "eval_loss": 0.3277411162853241, "eval_rewards/accuracies": 0.8062752485275269, "eval_rewards/chosen": 0.83104008436203, "eval_rewards/margins": 2.661613702774048, "eval_rewards/rejected": -1.8305736780166626, "eval_runtime": 193.0734, "eval_samples_per_second": 66.571, "eval_steps_per_second": 1.041, "step": 500 }, { "epoch": 0.5542035398230089, "grad_norm": 13.983589172363281, "learning_rate": 2.195919910405961e-07, "logits/chosen": -1.1875, "logits/rejected": -1.0546875, "logps/chosen": -243.5, "logps/rejected": -268.0, "loss": 0.3415, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.810546875, "rewards/margins": 2.7109375, "rewards/rejected": -1.89453125, "step": 501 }, { "epoch": 0.5553097345132744, "grad_norm": 13.887030601501465, "learning_rate": 2.1870227207111853e-07, "logits/chosen": -1.2578125, "logits/rejected": -1.0703125, "logps/chosen": -271.0, "logps/rejected": -282.0, "loss": 0.3074, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.68359375, "rewards/margins": 2.65625, "rewards/rejected": -1.9765625, "step": 502 }, { "epoch": 0.5564159292035398, "grad_norm": 13.680130004882812, "learning_rate": 2.1781295563685476e-07, "logits/chosen": -1.15625, "logits/rejected": -1.009765625, "logps/chosen": -280.0, "logps/rejected": -288.0, "loss": 0.3024, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8046875, "rewards/margins": 2.8828125, "rewards/rejected": -2.0859375, "step": 503 }, { "epoch": 0.5575221238938053, "grad_norm": 14.351888656616211, "learning_rate": 2.1692405317573366e-07, "logits/chosen": -1.28515625, "logits/rejected": -1.16796875, "logps/chosen": -257.0, "logps/rejected": -259.5, "loss": 0.3655, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.689453125, "rewards/margins": 2.515625, "rewards/rejected": -1.82421875, "step": 504 }, { "epoch": 0.5586283185840708, "grad_norm": 13.825343132019043, "learning_rate": 2.1603557612035932e-07, "logits/chosen": -1.1796875, "logits/rejected": -1.1875, "logps/chosen": -274.0, "logps/rejected": -302.0, "loss": 0.2957, "rewards/accuracies": 0.8515625, "rewards/chosen": 1.046875, "rewards/margins": 2.78125, "rewards/rejected": -1.73828125, "step": 505 }, { "epoch": 0.5597345132743363, "grad_norm": 15.617433547973633, "learning_rate": 2.1514753589786516e-07, "logits/chosen": -1.25, "logits/rejected": -1.1484375, "logps/chosen": -257.5, "logps/rejected": -283.0, "loss": 0.3659, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.775390625, "rewards/margins": 2.359375, "rewards/rejected": -1.578125, "step": 506 }, { "epoch": 0.5608407079646017, "grad_norm": 12.907238960266113, "learning_rate": 2.1425994392976559e-07, "logits/chosen": -1.22265625, "logits/rejected": -1.1875, "logps/chosen": -253.5, "logps/rejected": -284.0, "loss": 0.3268, "rewards/accuracies": 0.796875, "rewards/chosen": 0.8828125, "rewards/margins": 2.875, "rewards/rejected": -1.98828125, "step": 507 }, { "epoch": 0.5619469026548672, "grad_norm": 12.990286827087402, "learning_rate": 2.1337281163181034e-07, "logits/chosen": -1.31640625, "logits/rejected": -1.16015625, "logps/chosen": -280.0, "logps/rejected": -278.0, "loss": 0.2744, "rewards/accuracies": 0.859375, "rewards/chosen": 0.853515625, "rewards/margins": 2.7265625, "rewards/rejected": -1.875, "step": 508 }, { "epoch": 0.5630530973451328, "grad_norm": 10.47897720336914, "learning_rate": 2.1248615041383682e-07, "logits/chosen": -1.2421875, "logits/rejected": -1.1328125, "logps/chosen": -234.5, "logps/rejected": -279.5, "loss": 0.2341, "rewards/accuracies": 0.875, "rewards/chosen": 0.822265625, "rewards/margins": 3.046875, "rewards/rejected": -2.21875, "step": 509 }, { "epoch": 0.5641592920353983, "grad_norm": 12.620576858520508, "learning_rate": 2.1159997167962378e-07, "logits/chosen": -1.2421875, "logits/rejected": -1.078125, "logps/chosen": -229.5, "logps/rejected": -265.0, "loss": 0.3217, "rewards/accuracies": 0.796875, "rewards/chosen": 0.779296875, "rewards/margins": 2.5625, "rewards/rejected": -1.78515625, "step": 510 }, { "epoch": 0.5652654867256637, "grad_norm": 14.882222175598145, "learning_rate": 2.1071428682674436e-07, "logits/chosen": -1.234375, "logits/rejected": -1.140625, "logps/chosen": -258.0, "logps/rejected": -293.0, "loss": 0.3461, "rewards/accuracies": 0.7734375, "rewards/chosen": 1.03125, "rewards/margins": 2.65625, "rewards/rejected": -1.625, "step": 511 }, { "epoch": 0.5663716814159292, "grad_norm": 14.687088966369629, "learning_rate": 2.098291072464199e-07, "logits/chosen": -1.234375, "logits/rejected": -1.21875, "logps/chosen": -250.5, "logps/rejected": -307.0, "loss": 0.3347, "rewards/accuracies": 0.734375, "rewards/chosen": 1.025390625, "rewards/margins": 2.8203125, "rewards/rejected": -1.796875, "step": 512 }, { "epoch": 0.5674778761061947, "grad_norm": 13.261859893798828, "learning_rate": 2.0894444432337282e-07, "logits/chosen": -1.41796875, "logits/rejected": -1.18359375, "logps/chosen": -252.5, "logps/rejected": -263.0, "loss": 0.2804, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.830078125, "rewards/margins": 2.796875, "rewards/rejected": -1.96875, "step": 513 }, { "epoch": 0.5685840707964602, "grad_norm": 15.263360977172852, "learning_rate": 2.08060309435681e-07, "logits/chosen": -1.359375, "logits/rejected": -1.21484375, "logps/chosen": -267.0, "logps/rejected": -302.0, "loss": 0.3238, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.931640625, "rewards/margins": 2.7734375, "rewards/rejected": -1.84375, "step": 514 }, { "epoch": 0.5696902654867256, "grad_norm": 13.30583667755127, "learning_rate": 2.071767139546306e-07, "logits/chosen": -1.27734375, "logits/rejected": -1.09375, "logps/chosen": -253.5, "logps/rejected": -301.0, "loss": 0.3201, "rewards/accuracies": 0.78125, "rewards/chosen": 0.810546875, "rewards/margins": 2.609375, "rewards/rejected": -1.796875, "step": 515 }, { "epoch": 0.5707964601769911, "grad_norm": 13.454291343688965, "learning_rate": 2.062936692445705e-07, "logits/chosen": -1.2578125, "logits/rejected": -1.125, "logps/chosen": -244.0, "logps/rejected": -285.0, "loss": 0.3014, "rewards/accuracies": 0.828125, "rewards/chosen": 0.8046875, "rewards/margins": 2.6953125, "rewards/rejected": -1.8984375, "step": 516 }, { "epoch": 0.5719026548672567, "grad_norm": 14.328228950500488, "learning_rate": 2.0541118666276577e-07, "logits/chosen": -1.25, "logits/rejected": -1.17578125, "logps/chosen": -261.0, "logps/rejected": -314.0, "loss": 0.331, "rewards/accuracies": 0.78125, "rewards/chosen": 0.90625, "rewards/margins": 2.609375, "rewards/rejected": -1.70703125, "step": 517 }, { "epoch": 0.5730088495575221, "grad_norm": 15.810956001281738, "learning_rate": 2.045292775592515e-07, "logits/chosen": -1.25, "logits/rejected": -1.1875, "logps/chosen": -258.5, "logps/rejected": -287.0, "loss": 0.3654, "rewards/accuracies": 0.78125, "rewards/chosen": 0.828125, "rewards/margins": 2.765625, "rewards/rejected": -1.9375, "step": 518 }, { "epoch": 0.5741150442477876, "grad_norm": 14.267369270324707, "learning_rate": 2.0364795327668722e-07, "logits/chosen": -1.27734375, "logits/rejected": -1.078125, "logps/chosen": -290.0, "logps/rejected": -282.0, "loss": 0.3298, "rewards/accuracies": 0.796875, "rewards/chosen": 0.5419921875, "rewards/margins": 2.453125, "rewards/rejected": -1.9140625, "step": 519 }, { "epoch": 0.5752212389380531, "grad_norm": 17.044023513793945, "learning_rate": 2.0276722515021084e-07, "logits/chosen": -1.3125, "logits/rejected": -1.15625, "logps/chosen": -254.5, "logps/rejected": -288.0, "loss": 0.4207, "rewards/accuracies": 0.7421875, "rewards/chosen": 0.66015625, "rewards/margins": 2.12890625, "rewards/rejected": -1.46875, "step": 520 }, { "epoch": 0.5763274336283186, "grad_norm": 14.049108505249023, "learning_rate": 2.0188710450729253e-07, "logits/chosen": -1.19140625, "logits/rejected": -1.21875, "logps/chosen": -235.5, "logps/rejected": -283.0, "loss": 0.3075, "rewards/accuracies": 0.8125, "rewards/chosen": 1.232421875, "rewards/margins": 3.1640625, "rewards/rejected": -1.9375, "step": 521 }, { "epoch": 0.577433628318584, "grad_norm": 14.81714153289795, "learning_rate": 2.0100760266758953e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.08984375, "logps/chosen": -253.5, "logps/rejected": -244.5, "loss": 0.3601, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3740234375, "rewards/margins": 2.3671875, "rewards/rejected": -2.0, "step": 522 }, { "epoch": 0.5785398230088495, "grad_norm": 12.647814750671387, "learning_rate": 2.0012873094280032e-07, "logits/chosen": -1.2421875, "logits/rejected": -1.140625, "logps/chosen": -254.5, "logps/rejected": -297.0, "loss": 0.2831, "rewards/accuracies": 0.875, "rewards/chosen": 1.060546875, "rewards/margins": 3.125, "rewards/rejected": -2.0625, "step": 523 }, { "epoch": 0.5796460176991151, "grad_norm": 13.28409481048584, "learning_rate": 1.992505006365191e-07, "logits/chosen": -1.1484375, "logits/rejected": -1.11328125, "logps/chosen": -268.0, "logps/rejected": -295.0, "loss": 0.2986, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.8828125, "rewards/margins": 2.859375, "rewards/rejected": -1.96875, "step": 524 }, { "epoch": 0.5807522123893806, "grad_norm": 12.828714370727539, "learning_rate": 1.983729230440907e-07, "logits/chosen": -1.25390625, "logits/rejected": -1.26171875, "logps/chosen": -241.5, "logps/rejected": -292.0, "loss": 0.2867, "rewards/accuracies": 0.828125, "rewards/chosen": 1.021484375, "rewards/margins": 3.0703125, "rewards/rejected": -2.0546875, "step": 525 }, { "epoch": 0.581858407079646, "grad_norm": 13.270194053649902, "learning_rate": 1.974960094524647e-07, "logits/chosen": -1.1875, "logits/rejected": -1.13671875, "logps/chosen": -252.5, "logps/rejected": -284.0, "loss": 0.3036, "rewards/accuracies": 0.8125, "rewards/chosen": 0.837890625, "rewards/margins": 2.71875, "rewards/rejected": -1.87890625, "step": 526 }, { "epoch": 0.5829646017699115, "grad_norm": 14.76196575164795, "learning_rate": 1.9661977114005095e-07, "logits/chosen": -1.2109375, "logits/rejected": -1.09375, "logps/chosen": -266.0, "logps/rejected": -282.0, "loss": 0.3643, "rewards/accuracies": 0.7265625, "rewards/chosen": 0.83984375, "rewards/margins": 2.65625, "rewards/rejected": -1.81640625, "step": 527 }, { "epoch": 0.584070796460177, "grad_norm": 13.863215446472168, "learning_rate": 1.9574421937657423e-07, "logits/chosen": -1.2265625, "logits/rejected": -1.171875, "logps/chosen": -261.0, "logps/rejected": -295.0, "loss": 0.2729, "rewards/accuracies": 0.84375, "rewards/chosen": 1.02734375, "rewards/margins": 3.078125, "rewards/rejected": -2.0546875, "step": 528 }, { "epoch": 0.5851769911504425, "grad_norm": 17.09484100341797, "learning_rate": 1.9486936542292897e-07, "logits/chosen": -1.2734375, "logits/rejected": -1.1484375, "logps/chosen": -281.0, "logps/rejected": -284.0, "loss": 0.4144, "rewards/accuracies": 0.75, "rewards/chosen": 0.681640625, "rewards/margins": 2.203125, "rewards/rejected": -1.5234375, "step": 529 }, { "epoch": 0.5862831858407079, "grad_norm": 13.073324203491211, "learning_rate": 1.9399522053103512e-07, "logits/chosen": -1.22265625, "logits/rejected": -1.16015625, "logps/chosen": -256.5, "logps/rejected": -272.5, "loss": 0.3109, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.90234375, "rewards/margins": 2.734375, "rewards/rejected": -1.83984375, "step": 530 }, { "epoch": 0.5873893805309734, "grad_norm": 16.40873908996582, "learning_rate": 1.9312179594369267e-07, "logits/chosen": -1.171875, "logits/rejected": -1.1015625, "logps/chosen": -270.0, "logps/rejected": -298.0, "loss": 0.3547, "rewards/accuracies": 0.765625, "rewards/chosen": 0.759765625, "rewards/margins": 2.5078125, "rewards/rejected": -1.75, "step": 531 }, { "epoch": 0.588495575221239, "grad_norm": 13.864864349365234, "learning_rate": 1.9224910289443766e-07, "logits/chosen": -1.27734375, "logits/rejected": -1.13671875, "logps/chosen": -233.5, "logps/rejected": -259.0, "loss": 0.367, "rewards/accuracies": 0.78125, "rewards/chosen": 0.880859375, "rewards/margins": 2.4765625, "rewards/rejected": -1.6015625, "step": 532 }, { "epoch": 0.5896017699115044, "grad_norm": 11.59555721282959, "learning_rate": 1.913771526073976e-07, "logits/chosen": -1.3203125, "logits/rejected": -1.12890625, "logps/chosen": -254.0, "logps/rejected": -295.0, "loss": 0.2546, "rewards/accuracies": 0.875, "rewards/chosen": 1.154296875, "rewards/margins": 2.9140625, "rewards/rejected": -1.75390625, "step": 533 }, { "epoch": 0.5907079646017699, "grad_norm": 559.9500122070312, "learning_rate": 1.9050595629714654e-07, "logits/chosen": -1.2265625, "logits/rejected": -0.99609375, "logps/chosen": -270.0, "logps/rejected": -344.0, "loss": 0.3657, "rewards/accuracies": 0.78125, "rewards/chosen": 0.978515625, "rewards/margins": 2.65625, "rewards/rejected": -1.671875, "step": 534 }, { "epoch": 0.5918141592920354, "grad_norm": 12.260162353515625, "learning_rate": 1.8963552516856158e-07, "logits/chosen": -1.2734375, "logits/rejected": -1.16015625, "logps/chosen": -242.0, "logps/rejected": -265.5, "loss": 0.2995, "rewards/accuracies": 0.828125, "rewards/chosen": 1.09375, "rewards/margins": 3.015625, "rewards/rejected": -1.921875, "step": 535 }, { "epoch": 0.5929203539823009, "grad_norm": 15.339058876037598, "learning_rate": 1.8876587041667852e-07, "logits/chosen": -1.27734375, "logits/rejected": -1.20703125, "logps/chosen": -241.0, "logps/rejected": -265.0, "loss": 0.3577, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.865234375, "rewards/margins": 2.3984375, "rewards/rejected": -1.52734375, "step": 536 }, { "epoch": 0.5940265486725663, "grad_norm": 16.724506378173828, "learning_rate": 1.8789700322654747e-07, "logits/chosen": -1.23828125, "logits/rejected": -1.046875, "logps/chosen": -251.0, "logps/rejected": -289.0, "loss": 0.2921, "rewards/accuracies": 0.859375, "rewards/chosen": 0.802734375, "rewards/margins": 2.90625, "rewards/rejected": -2.1015625, "step": 537 }, { "epoch": 0.5951327433628318, "grad_norm": 14.673587799072266, "learning_rate": 1.8702893477308972e-07, "logits/chosen": -1.2265625, "logits/rejected": -1.078125, "logps/chosen": -253.0, "logps/rejected": -261.0, "loss": 0.3511, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.869140625, "rewards/margins": 2.671875, "rewards/rejected": -1.8046875, "step": 538 }, { "epoch": 0.5962389380530974, "grad_norm": 15.79550552368164, "learning_rate": 1.8616167622095324e-07, "logits/chosen": -1.203125, "logits/rejected": -1.12109375, "logps/chosen": -267.0, "logps/rejected": -313.0, "loss": 0.3384, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.021484375, "rewards/margins": 2.671875, "rewards/rejected": -1.6484375, "step": 539 }, { "epoch": 0.5973451327433629, "grad_norm": 14.441572189331055, "learning_rate": 1.8529523872436977e-07, "logits/chosen": -1.37890625, "logits/rejected": -1.1640625, "logps/chosen": -249.5, "logps/rejected": -272.0, "loss": 0.3185, "rewards/accuracies": 0.8125, "rewards/chosen": 0.921875, "rewards/margins": 2.671875, "rewards/rejected": -1.75, "step": 540 }, { "epoch": 0.5984513274336283, "grad_norm": 13.769876480102539, "learning_rate": 1.8442963342701105e-07, "logits/chosen": -1.14453125, "logits/rejected": -1.1875, "logps/chosen": -277.0, "logps/rejected": -275.0, "loss": 0.2794, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.994140625, "rewards/margins": 2.9375, "rewards/rejected": -1.9453125, "step": 541 }, { "epoch": 0.5995575221238938, "grad_norm": 14.039227485656738, "learning_rate": 1.8356487146184516e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.20703125, "logps/chosen": -234.0, "logps/rejected": -252.5, "loss": 0.3448, "rewards/accuracies": 0.8046875, "rewards/chosen": 1.119140625, "rewards/margins": 2.78125, "rewards/rejected": -1.6640625, "step": 542 }, { "epoch": 0.6006637168141593, "grad_norm": 12.930294036865234, "learning_rate": 1.8270096395099403e-07, "logits/chosen": -1.31640625, "logits/rejected": -1.171875, "logps/chosen": -242.5, "logps/rejected": -273.0, "loss": 0.298, "rewards/accuracies": 0.859375, "rewards/chosen": 1.0859375, "rewards/margins": 2.703125, "rewards/rejected": -1.6171875, "step": 543 }, { "epoch": 0.6017699115044248, "grad_norm": 13.82011890411377, "learning_rate": 1.8183792200559e-07, "logits/chosen": -1.2890625, "logits/rejected": -1.1640625, "logps/chosen": -256.0, "logps/rejected": -287.0, "loss": 0.3418, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.943359375, "rewards/margins": 2.359375, "rewards/rejected": -1.4140625, "step": 544 }, { "epoch": 0.6028761061946902, "grad_norm": 15.775805473327637, "learning_rate": 1.8097575672563275e-07, "logits/chosen": -1.34375, "logits/rejected": -1.15625, "logps/chosen": -249.0, "logps/rejected": -275.0, "loss": 0.2854, "rewards/accuracies": 0.828125, "rewards/chosen": 1.1875, "rewards/margins": 3.1171875, "rewards/rejected": -1.92578125, "step": 545 }, { "epoch": 0.6039823008849557, "grad_norm": 12.980159759521484, "learning_rate": 1.80114479199847e-07, "logits/chosen": -1.1171875, "logits/rejected": -1.140625, "logps/chosen": -263.0, "logps/rejected": -277.0, "loss": 0.2444, "rewards/accuracies": 0.875, "rewards/chosen": 1.23046875, "rewards/margins": 3.28125, "rewards/rejected": -2.0546875, "step": 546 }, { "epoch": 0.6050884955752213, "grad_norm": 13.192628860473633, "learning_rate": 1.792541005055394e-07, "logits/chosen": -1.2578125, "logits/rejected": -1.22265625, "logps/chosen": -254.0, "logps/rejected": -286.0, "loss": 0.3065, "rewards/accuracies": 0.8359375, "rewards/chosen": 1.138671875, "rewards/margins": 2.8984375, "rewards/rejected": -1.7578125, "step": 547 }, { "epoch": 0.6061946902654868, "grad_norm": 13.52103328704834, "learning_rate": 1.783946317084564e-07, "logits/chosen": -1.328125, "logits/rejected": -1.08984375, "logps/chosen": -253.0, "logps/rejected": -269.0, "loss": 0.2691, "rewards/accuracies": 0.8671875, "rewards/chosen": 1.01953125, "rewards/margins": 2.9765625, "rewards/rejected": -1.953125, "step": 548 }, { "epoch": 0.6073008849557522, "grad_norm": 12.777830123901367, "learning_rate": 1.7753608386264193e-07, "logits/chosen": -1.19921875, "logits/rejected": -1.20703125, "logps/chosen": -225.0, "logps/rejected": -274.0, "loss": 0.3203, "rewards/accuracies": 0.8203125, "rewards/chosen": 1.09765625, "rewards/margins": 2.7578125, "rewards/rejected": -1.66796875, "step": 549 }, { "epoch": 0.6084070796460177, "grad_norm": 13.757856369018555, "learning_rate": 1.7667846801029486e-07, "logits/chosen": -1.32421875, "logits/rejected": -1.13671875, "logps/chosen": -264.0, "logps/rejected": -278.0, "loss": 0.2789, "rewards/accuracies": 0.8671875, "rewards/chosen": 1.126953125, "rewards/margins": 2.640625, "rewards/rejected": -1.515625, "step": 550 }, { "epoch": 0.6084070796460177, "eval_logits/chosen": -1.262554407119751, "eval_logits/rejected": -1.1598647832870483, "eval_logps/chosen": -250.73133850097656, "eval_logps/rejected": -276.39801025390625, "eval_loss": 0.32494351267814636, "eval_rewards/accuracies": 0.8066800236701965, "eval_rewards/chosen": 1.011232852935791, "eval_rewards/margins": 2.7289724349975586, "eval_rewards/rejected": -1.717836618423462, "eval_runtime": 193.0762, "eval_samples_per_second": 66.57, "eval_steps_per_second": 1.041, "step": 550 }, { "epoch": 0.6095132743362832, "grad_norm": 16.458438873291016, "learning_rate": 1.758217951816274e-07, "logits/chosen": -1.25390625, "logits/rejected": -1.15625, "logps/chosen": -289.0, "logps/rejected": -310.0, "loss": 0.3871, "rewards/accuracies": 0.7578125, "rewards/chosen": 0.57421875, "rewards/margins": 2.15625, "rewards/rejected": -1.5859375, "step": 551 }, { "epoch": 0.6106194690265486, "grad_norm": 15.513572692871094, "learning_rate": 1.7496607639472327e-07, "logits/chosen": -1.2265625, "logits/rejected": -1.19921875, "logps/chosen": -242.0, "logps/rejected": -270.0, "loss": 0.33, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.00390625, "rewards/margins": 2.9375, "rewards/rejected": -1.9375, "step": 552 }, { "epoch": 0.6117256637168141, "grad_norm": 15.365250587463379, "learning_rate": 1.7411132265539536e-07, "logits/chosen": -1.203125, "logits/rejected": -1.0625, "logps/chosen": -250.0, "logps/rejected": -297.0, "loss": 0.3456, "rewards/accuracies": 0.796875, "rewards/chosen": 0.931640625, "rewards/margins": 2.546875, "rewards/rejected": -1.6171875, "step": 553 }, { "epoch": 0.6128318584070797, "grad_norm": 14.896991729736328, "learning_rate": 1.7325754495704507e-07, "logits/chosen": -1.25390625, "logits/rejected": -1.1875, "logps/chosen": -267.5, "logps/rejected": -315.0, "loss": 0.3605, "rewards/accuracies": 0.78125, "rewards/chosen": 0.787109375, "rewards/margins": 2.3671875, "rewards/rejected": -1.578125, "step": 554 }, { "epoch": 0.6139380530973452, "grad_norm": 14.317460060119629, "learning_rate": 1.7240475428051997e-07, "logits/chosen": -1.34765625, "logits/rejected": -1.1328125, "logps/chosen": -247.0, "logps/rejected": -268.0, "loss": 0.3123, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.96875, "rewards/margins": 2.75, "rewards/rejected": -1.78515625, "step": 555 }, { "epoch": 0.6150442477876106, "grad_norm": 13.919463157653809, "learning_rate": 1.7155296159397356e-07, "logits/chosen": -1.22265625, "logits/rejected": -1.19140625, "logps/chosen": -261.5, "logps/rejected": -304.0, "loss": 0.3188, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.88671875, "rewards/margins": 2.7734375, "rewards/rejected": -1.890625, "step": 556 }, { "epoch": 0.6161504424778761, "grad_norm": 15.724615097045898, "learning_rate": 1.707021778527235e-07, "logits/chosen": -1.34765625, "logits/rejected": -1.1953125, "logps/chosen": -278.0, "logps/rejected": -298.0, "loss": 0.3411, "rewards/accuracies": 0.78125, "rewards/chosen": 1.0625, "rewards/margins": 2.5234375, "rewards/rejected": -1.4609375, "step": 557 }, { "epoch": 0.6172566371681416, "grad_norm": 11.178709983825684, "learning_rate": 1.6985241399911082e-07, "logits/chosen": -1.41796875, "logits/rejected": -1.21484375, "logps/chosen": -234.5, "logps/rejected": -259.0, "loss": 0.2349, "rewards/accuracies": 0.859375, "rewards/chosen": 0.91015625, "rewards/margins": 3.2265625, "rewards/rejected": -2.3125, "step": 558 }, { "epoch": 0.6183628318584071, "grad_norm": 13.019828796386719, "learning_rate": 1.6900368096235931e-07, "logits/chosen": -1.328125, "logits/rejected": -1.1953125, "logps/chosen": -227.5, "logps/rejected": -288.0, "loss": 0.3063, "rewards/accuracies": 0.8125, "rewards/chosen": 1.208984375, "rewards/margins": 3.265625, "rewards/rejected": -2.05078125, "step": 559 }, { "epoch": 0.6194690265486725, "grad_norm": 15.409528732299805, "learning_rate": 1.6815598965843519e-07, "logits/chosen": -1.30078125, "logits/rejected": -1.15625, "logps/chosen": -266.0, "logps/rejected": -326.0, "loss": 0.2972, "rewards/accuracies": 0.8046875, "rewards/chosen": 1.12109375, "rewards/margins": 3.1875, "rewards/rejected": -2.0703125, "step": 560 }, { "epoch": 0.620575221238938, "grad_norm": 13.341466903686523, "learning_rate": 1.67309350989906e-07, "logits/chosen": -1.37109375, "logits/rejected": -1.09765625, "logps/chosen": -257.0, "logps/rejected": -259.5, "loss": 0.3021, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.78125, "rewards/margins": 2.65625, "rewards/rejected": -1.875, "step": 561 }, { "epoch": 0.6216814159292036, "grad_norm": 13.608122825622559, "learning_rate": 1.664637758458013e-07, "logits/chosen": -1.3828125, "logits/rejected": -1.078125, "logps/chosen": -248.5, "logps/rejected": -238.5, "loss": 0.3346, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4716796875, "rewards/margins": 2.3359375, "rewards/rejected": -1.86328125, "step": 562 }, { "epoch": 0.6227876106194691, "grad_norm": 14.688355445861816, "learning_rate": 1.656192751014717e-07, "logits/chosen": -1.2578125, "logits/rejected": -1.13671875, "logps/chosen": -266.0, "logps/rejected": -298.0, "loss": 0.3522, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.90234375, "rewards/margins": 2.578125, "rewards/rejected": -1.67578125, "step": 563 }, { "epoch": 0.6238938053097345, "grad_norm": 13.295293807983398, "learning_rate": 1.647758596184498e-07, "logits/chosen": -1.1875, "logits/rejected": -1.17578125, "logps/chosen": -262.5, "logps/rejected": -287.0, "loss": 0.3039, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.849609375, "rewards/margins": 2.4921875, "rewards/rejected": -1.64453125, "step": 564 }, { "epoch": 0.625, "grad_norm": 12.697134017944336, "learning_rate": 1.6393354024431e-07, "logits/chosen": -1.2421875, "logits/rejected": -1.05078125, "logps/chosen": -257.5, "logps/rejected": -265.0, "loss": 0.2807, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.87109375, "rewards/margins": 3.015625, "rewards/rejected": -2.14453125, "step": 565 }, { "epoch": 0.6261061946902655, "grad_norm": 23.94145965576172, "learning_rate": 1.63092327812529e-07, "logits/chosen": -1.34375, "logits/rejected": -1.2734375, "logps/chosen": -246.5, "logps/rejected": -232.5, "loss": 0.3647, "rewards/accuracies": 0.8125, "rewards/chosen": 0.576171875, "rewards/margins": 2.421875, "rewards/rejected": -1.84375, "step": 566 }, { "epoch": 0.6272123893805309, "grad_norm": 13.088120460510254, "learning_rate": 1.622522331423467e-07, "logits/chosen": -1.2109375, "logits/rejected": -1.1640625, "logps/chosen": -261.0, "logps/rejected": -307.0, "loss": 0.307, "rewards/accuracies": 0.84375, "rewards/chosen": 0.875, "rewards/margins": 2.8046875, "rewards/rejected": -1.92578125, "step": 567 }, { "epoch": 0.6283185840707964, "grad_norm": 16.868358612060547, "learning_rate": 1.6141326703862706e-07, "logits/chosen": -1.2734375, "logits/rejected": -1.24609375, "logps/chosen": -260.0, "logps/rejected": -292.0, "loss": 0.4, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.720703125, "rewards/margins": 2.5703125, "rewards/rejected": -1.8515625, "step": 568 }, { "epoch": 0.629424778761062, "grad_norm": 13.88227653503418, "learning_rate": 1.605754402917186e-07, "logits/chosen": -1.43359375, "logits/rejected": -1.28125, "logps/chosen": -245.0, "logps/rejected": -266.5, "loss": 0.2863, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.96484375, "rewards/margins": 2.9296875, "rewards/rejected": -1.96875, "step": 569 }, { "epoch": 0.6305309734513275, "grad_norm": 13.3289794921875, "learning_rate": 1.5973876367731651e-07, "logits/chosen": -1.35546875, "logits/rejected": -1.08203125, "logps/chosen": -280.0, "logps/rejected": -306.0, "loss": 0.2719, "rewards/accuracies": 0.84375, "rewards/chosen": 1.14453125, "rewards/margins": 3.078125, "rewards/rejected": -1.93359375, "step": 570 }, { "epoch": 0.6316371681415929, "grad_norm": 13.53636360168457, "learning_rate": 1.5890324795632315e-07, "logits/chosen": -1.3671875, "logits/rejected": -1.28515625, "logps/chosen": -223.0, "logps/rejected": -262.0, "loss": 0.3, "rewards/accuracies": 0.8125, "rewards/chosen": 1.017578125, "rewards/margins": 2.90625, "rewards/rejected": -1.89453125, "step": 571 }, { "epoch": 0.6327433628318584, "grad_norm": 14.343642234802246, "learning_rate": 1.5806890387471023e-07, "logits/chosen": -1.3359375, "logits/rejected": -1.12109375, "logps/chosen": -267.0, "logps/rejected": -291.0, "loss": 0.2824, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9453125, "rewards/margins": 2.984375, "rewards/rejected": -2.04296875, "step": 572 }, { "epoch": 0.6338495575221239, "grad_norm": 13.577574729919434, "learning_rate": 1.5723574216338065e-07, "logits/chosen": -1.26171875, "logits/rejected": -1.17578125, "logps/chosen": -272.0, "logps/rejected": -276.0, "loss": 0.2799, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9296875, "rewards/margins": 2.8125, "rewards/rejected": -1.8828125, "step": 573 }, { "epoch": 0.6349557522123894, "grad_norm": 17.00868797302246, "learning_rate": 1.5640377353802985e-07, "logits/chosen": -1.3125, "logits/rejected": -1.025390625, "logps/chosen": -286.0, "logps/rejected": -286.0, "loss": 0.3574, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.73828125, "rewards/margins": 2.6015625, "rewards/rejected": -1.859375, "step": 574 }, { "epoch": 0.6360619469026548, "grad_norm": 14.934076309204102, "learning_rate": 1.5557300869900874e-07, "logits/chosen": -1.2578125, "logits/rejected": -1.09375, "logps/chosen": -281.5, "logps/rejected": -320.0, "loss": 0.347, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.7509765625, "rewards/margins": 2.47265625, "rewards/rejected": -1.7265625, "step": 575 }, { "epoch": 0.6371681415929203, "grad_norm": 14.064445495605469, "learning_rate": 1.547434583311858e-07, "logits/chosen": -1.234375, "logits/rejected": -1.06640625, "logps/chosen": -262.0, "logps/rejected": -262.0, "loss": 0.374, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.5078125, "rewards/margins": 2.16796875, "rewards/rejected": -1.65625, "step": 576 }, { "epoch": 0.6382743362831859, "grad_norm": 14.291051864624023, "learning_rate": 1.5391513310380923e-07, "logits/chosen": -1.17578125, "logits/rejected": -1.1484375, "logps/chosen": -264.5, "logps/rejected": -322.0, "loss": 0.2885, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.8671875, "rewards/margins": 2.8359375, "rewards/rejected": -1.96484375, "step": 577 }, { "epoch": 0.6393805309734514, "grad_norm": 15.690069198608398, "learning_rate": 1.5308804367037049e-07, "logits/chosen": -1.37890625, "logits/rejected": -1.12109375, "logps/chosen": -271.0, "logps/rejected": -313.0, "loss": 0.3193, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.921875, "rewards/margins": 2.8125, "rewards/rejected": -1.88671875, "step": 578 }, { "epoch": 0.6404867256637168, "grad_norm": 14.940479278564453, "learning_rate": 1.5226220066846662e-07, "logits/chosen": -1.20703125, "logits/rejected": -1.12109375, "logps/chosen": -277.0, "logps/rejected": -313.0, "loss": 0.317, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.912109375, "rewards/margins": 2.7421875, "rewards/rejected": -1.828125, "step": 579 }, { "epoch": 0.6415929203539823, "grad_norm": 14.204512596130371, "learning_rate": 1.5143761471966387e-07, "logits/chosen": -1.25, "logits/rejected": -1.1953125, "logps/chosen": -267.0, "logps/rejected": -296.0, "loss": 0.2923, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.935546875, "rewards/margins": 2.9453125, "rewards/rejected": -2.01953125, "step": 580 }, { "epoch": 0.6426991150442478, "grad_norm": 12.345739364624023, "learning_rate": 1.5061429642936104e-07, "logits/chosen": -1.2734375, "logits/rejected": -1.15625, "logps/chosen": -238.5, "logps/rejected": -271.0, "loss": 0.2898, "rewards/accuracies": 0.828125, "rewards/chosen": 1.078125, "rewards/margins": 3.0390625, "rewards/rejected": -1.9609375, "step": 581 }, { "epoch": 0.6438053097345132, "grad_norm": 14.108118057250977, "learning_rate": 1.497922563866526e-07, "logits/chosen": -1.25390625, "logits/rejected": -1.25390625, "logps/chosen": -225.5, "logps/rejected": -276.0, "loss": 0.3588, "rewards/accuracies": 0.8203125, "rewards/chosen": 1.01171875, "rewards/margins": 2.5078125, "rewards/rejected": -1.4921875, "step": 582 }, { "epoch": 0.6449115044247787, "grad_norm": 15.642598152160645, "learning_rate": 1.4897150516419315e-07, "logits/chosen": -1.33984375, "logits/rejected": -1.09765625, "logps/chosen": -262.5, "logps/rejected": -281.0, "loss": 0.3357, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.962890625, "rewards/margins": 2.671875, "rewards/rejected": -1.70703125, "step": 583 }, { "epoch": 0.6460176991150443, "grad_norm": 13.628485679626465, "learning_rate": 1.481520533180611e-07, "logits/chosen": -1.2734375, "logits/rejected": -1.16796875, "logps/chosen": -245.0, "logps/rejected": -250.0, "loss": 0.2903, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8515625, "rewards/margins": 2.8359375, "rewards/rejected": -1.9921875, "step": 584 }, { "epoch": 0.6471238938053098, "grad_norm": 12.115748405456543, "learning_rate": 1.4733391138762275e-07, "logits/chosen": -1.265625, "logits/rejected": -1.21484375, "logps/chosen": -237.0, "logps/rejected": -255.5, "loss": 0.2511, "rewards/accuracies": 0.828125, "rewards/chosen": 1.16796875, "rewards/margins": 3.484375, "rewards/rejected": -2.3125, "step": 585 }, { "epoch": 0.6482300884955752, "grad_norm": 12.073527336120605, "learning_rate": 1.4651708989539733e-07, "logits/chosen": -1.3203125, "logits/rejected": -1.2265625, "logps/chosen": -255.0, "logps/rejected": -251.5, "loss": 0.27, "rewards/accuracies": 0.84375, "rewards/chosen": 1.025390625, "rewards/margins": 2.8671875, "rewards/rejected": -1.84375, "step": 586 }, { "epoch": 0.6493362831858407, "grad_norm": 15.416234016418457, "learning_rate": 1.4570159934692084e-07, "logits/chosen": -1.2265625, "logits/rejected": -1.234375, "logps/chosen": -264.0, "logps/rejected": -290.0, "loss": 0.4144, "rewards/accuracies": 0.78125, "rewards/chosen": 0.69921875, "rewards/margins": 2.203125, "rewards/rejected": -1.50390625, "step": 587 }, { "epoch": 0.6504424778761062, "grad_norm": 14.182881355285645, "learning_rate": 1.448874502306116e-07, "logits/chosen": -1.21875, "logits/rejected": -1.0859375, "logps/chosen": -262.5, "logps/rejected": -280.0, "loss": 0.3193, "rewards/accuracies": 0.796875, "rewards/chosen": 0.9921875, "rewards/margins": 2.7421875, "rewards/rejected": -1.75390625, "step": 588 }, { "epoch": 0.6515486725663717, "grad_norm": 14.108832359313965, "learning_rate": 1.4407465301763532e-07, "logits/chosen": -1.37890625, "logits/rejected": -1.21875, "logps/chosen": -249.0, "logps/rejected": -257.5, "loss": 0.355, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.75, "rewards/margins": 2.734375, "rewards/rejected": -1.98046875, "step": 589 }, { "epoch": 0.6526548672566371, "grad_norm": 14.618428230285645, "learning_rate": 1.432632181617698e-07, "logits/chosen": -1.24609375, "logits/rejected": -1.02734375, "logps/chosen": -243.0, "logps/rejected": -281.0, "loss": 0.313, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0078125, "rewards/margins": 2.9296875, "rewards/rejected": -1.92578125, "step": 590 }, { "epoch": 0.6537610619469026, "grad_norm": 15.182010650634766, "learning_rate": 1.4245315609927112e-07, "logits/chosen": -1.2578125, "logits/rejected": -1.14453125, "logps/chosen": -262.0, "logps/rejected": -269.0, "loss": 0.3443, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.732421875, "rewards/margins": 2.6484375, "rewards/rejected": -1.91015625, "step": 591 }, { "epoch": 0.6548672566371682, "grad_norm": 14.359109878540039, "learning_rate": 1.4164447724873933e-07, "logits/chosen": -1.19140625, "logits/rejected": -1.16015625, "logps/chosen": -253.5, "logps/rejected": -288.0, "loss": 0.3191, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.93359375, "rewards/margins": 2.6171875, "rewards/rejected": -1.6875, "step": 592 }, { "epoch": 0.6559734513274337, "grad_norm": 14.253448486328125, "learning_rate": 1.4083719201098402e-07, "logits/chosen": -1.3359375, "logits/rejected": -1.2421875, "logps/chosen": -251.5, "logps/rejected": -288.0, "loss": 0.3304, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.8203125, "rewards/margins": 2.4765625, "rewards/rejected": -1.65625, "step": 593 }, { "epoch": 0.6570796460176991, "grad_norm": 14.521297454833984, "learning_rate": 1.400313107688912e-07, "logits/chosen": -1.37109375, "logits/rejected": -1.19140625, "logps/chosen": -250.0, "logps/rejected": -260.0, "loss": 0.3297, "rewards/accuracies": 0.8125, "rewards/chosen": 0.982421875, "rewards/margins": 2.859375, "rewards/rejected": -1.875, "step": 594 }, { "epoch": 0.6581858407079646, "grad_norm": 13.660658836364746, "learning_rate": 1.39226843887289e-07, "logits/chosen": -1.26171875, "logits/rejected": -1.19140625, "logps/chosen": -235.0, "logps/rejected": -296.0, "loss": 0.3347, "rewards/accuracies": 0.796875, "rewards/chosen": 0.96875, "rewards/margins": 2.9609375, "rewards/rejected": -1.9921875, "step": 595 }, { "epoch": 0.6592920353982301, "grad_norm": 15.24756908416748, "learning_rate": 1.384238017128152e-07, "logits/chosen": -1.2734375, "logits/rejected": -1.12109375, "logps/chosen": -241.5, "logps/rejected": -274.0, "loss": 0.3958, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.884765625, "rewards/margins": 2.6875, "rewards/rejected": -1.80859375, "step": 596 }, { "epoch": 0.6603982300884956, "grad_norm": 14.213970184326172, "learning_rate": 1.3762219457378354e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.12109375, "logps/chosen": -240.0, "logps/rejected": -288.0, "loss": 0.2724, "rewards/accuracies": 0.828125, "rewards/chosen": 1.1484375, "rewards/margins": 2.84375, "rewards/rejected": -1.6953125, "step": 597 }, { "epoch": 0.661504424778761, "grad_norm": 13.295817375183105, "learning_rate": 1.3682203278005095e-07, "logits/chosen": -1.203125, "logits/rejected": -1.11328125, "logps/chosen": -267.0, "logps/rejected": -287.0, "loss": 0.2403, "rewards/accuracies": 0.890625, "rewards/chosen": 1.296875, "rewards/margins": 3.078125, "rewards/rejected": -1.77734375, "step": 598 }, { "epoch": 0.6626106194690266, "grad_norm": 13.04489517211914, "learning_rate": 1.3602332662288534e-07, "logits/chosen": -1.2578125, "logits/rejected": -1.078125, "logps/chosen": -262.0, "logps/rejected": -269.0, "loss": 0.2891, "rewards/accuracies": 0.8671875, "rewards/chosen": 0.86328125, "rewards/margins": 2.71875, "rewards/rejected": -1.859375, "step": 599 }, { "epoch": 0.6637168141592921, "grad_norm": 15.907817840576172, "learning_rate": 1.3522608637483266e-07, "logits/chosen": -1.359375, "logits/rejected": -1.203125, "logps/chosen": -241.5, "logps/rejected": -275.0, "loss": 0.3724, "rewards/accuracies": 0.734375, "rewards/chosen": 0.548828125, "rewards/margins": 2.2421875, "rewards/rejected": -1.6953125, "step": 600 }, { "epoch": 0.6637168141592921, "eval_logits/chosen": -1.2614272832870483, "eval_logits/rejected": -1.1564831733703613, "eval_logps/chosen": -251.43780517578125, "eval_logps/rejected": -277.5970153808594, "eval_loss": 0.3234591782093048, "eval_rewards/accuracies": 0.809928834438324, "eval_rewards/chosen": 0.9355371594429016, "eval_rewards/margins": 2.773709535598755, "eval_rewards/rejected": -1.8374922275543213, "eval_runtime": 193.0898, "eval_samples_per_second": 66.565, "eval_steps_per_second": 1.041, "step": 600 }, { "epoch": 0.6648230088495575, "grad_norm": 13.516127586364746, "learning_rate": 1.3443032228958545e-07, "logits/chosen": -1.2109375, "logits/rejected": -1.1015625, "logps/chosen": -252.0, "logps/rejected": -284.0, "loss": 0.3214, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.916015625, "rewards/margins": 2.734375, "rewards/rejected": -1.8203125, "step": 601 }, { "epoch": 0.665929203539823, "grad_norm": 13.290761947631836, "learning_rate": 1.336360446018503e-07, "logits/chosen": -1.37890625, "logits/rejected": -1.1953125, "logps/chosen": -240.5, "logps/rejected": -248.5, "loss": 0.3253, "rewards/accuracies": 0.8046875, "rewards/chosen": 1.048828125, "rewards/margins": 2.65625, "rewards/rejected": -1.60546875, "step": 602 }, { "epoch": 0.6670353982300885, "grad_norm": 13.730428695678711, "learning_rate": 1.3284326352721675e-07, "logits/chosen": -1.24609375, "logits/rejected": -1.15625, "logps/chosen": -237.0, "logps/rejected": -265.0, "loss": 0.3161, "rewards/accuracies": 0.8671875, "rewards/chosen": 0.96875, "rewards/margins": 3.171875, "rewards/rejected": -2.1953125, "step": 603 }, { "epoch": 0.668141592920354, "grad_norm": 13.699116706848145, "learning_rate": 1.3205198926202544e-07, "logits/chosen": -1.25, "logits/rejected": -1.125, "logps/chosen": -254.5, "logps/rejected": -295.0, "loss": 0.3262, "rewards/accuracies": 0.796875, "rewards/chosen": 1.0390625, "rewards/margins": 2.7265625, "rewards/rejected": -1.6875, "step": 604 }, { "epoch": 0.6692477876106194, "grad_norm": 15.250642776489258, "learning_rate": 1.312622319832375e-07, "logits/chosen": -1.234375, "logits/rejected": -1.13671875, "logps/chosen": -262.0, "logps/rejected": -275.0, "loss": 0.3704, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.822265625, "rewards/margins": 2.515625, "rewards/rejected": -1.69140625, "step": 605 }, { "epoch": 0.6703539823008849, "grad_norm": 14.902206420898438, "learning_rate": 1.3047400184830303e-07, "logits/chosen": -1.18359375, "logits/rejected": -1.064453125, "logps/chosen": -248.5, "logps/rejected": -277.0, "loss": 0.3634, "rewards/accuracies": 0.796875, "rewards/chosen": 0.76953125, "rewards/margins": 2.3984375, "rewards/rejected": -1.625, "step": 606 }, { "epoch": 0.6714601769911505, "grad_norm": 15.015229225158691, "learning_rate": 1.2968730899503106e-07, "logits/chosen": -1.39453125, "logits/rejected": -1.265625, "logps/chosen": -254.5, "logps/rejected": -271.0, "loss": 0.3279, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.943359375, "rewards/margins": 2.625, "rewards/rejected": -1.6796875, "step": 607 }, { "epoch": 0.672566371681416, "grad_norm": 14.369139671325684, "learning_rate": 1.2890216354145888e-07, "logits/chosen": -1.28515625, "logits/rejected": -1.1796875, "logps/chosen": -241.0, "logps/rejected": -251.0, "loss": 0.3464, "rewards/accuracies": 0.765625, "rewards/chosen": 0.953125, "rewards/margins": 2.625, "rewards/rejected": -1.671875, "step": 608 }, { "epoch": 0.6736725663716814, "grad_norm": 13.686366081237793, "learning_rate": 1.2811857558572167e-07, "logits/chosen": -1.1953125, "logits/rejected": -1.056640625, "logps/chosen": -259.0, "logps/rejected": -262.0, "loss": 0.3501, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.763671875, "rewards/margins": 2.7421875, "rewards/rejected": -1.9765625, "step": 609 }, { "epoch": 0.6747787610619469, "grad_norm": 13.95426082611084, "learning_rate": 1.2733655520592326e-07, "logits/chosen": -1.30078125, "logits/rejected": -1.1640625, "logps/chosen": -257.5, "logps/rejected": -308.0, "loss": 0.2923, "rewards/accuracies": 0.8203125, "rewards/chosen": 1.10546875, "rewards/margins": 2.9765625, "rewards/rejected": -1.875, "step": 610 }, { "epoch": 0.6758849557522124, "grad_norm": 13.288588523864746, "learning_rate": 1.265561124600057e-07, "logits/chosen": -1.1875, "logits/rejected": -1.0703125, "logps/chosen": -250.0, "logps/rejected": -277.0, "loss": 0.3124, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.875, "rewards/margins": 2.765625, "rewards/rejected": -1.890625, "step": 611 }, { "epoch": 0.6769911504424779, "grad_norm": 14.142792701721191, "learning_rate": 1.2577725738562068e-07, "logits/chosen": -1.30078125, "logits/rejected": -1.18359375, "logps/chosen": -244.5, "logps/rejected": -249.0, "loss": 0.3795, "rewards/accuracies": 0.75, "rewards/chosen": 0.533203125, "rewards/margins": 2.19921875, "rewards/rejected": -1.66796875, "step": 612 }, { "epoch": 0.6780973451327433, "grad_norm": 12.952162742614746, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -1.08984375, "logits/rejected": -1.109375, "logps/chosen": -259.5, "logps/rejected": -315.0, "loss": 0.2972, "rewards/accuracies": 0.828125, "rewards/chosen": 0.9453125, "rewards/margins": 2.984375, "rewards/rejected": -2.03515625, "step": 613 }, { "epoch": 0.6792035398230089, "grad_norm": 12.563096046447754, "learning_rate": 1.2422435029982667e-07, "logits/chosen": -1.33203125, "logits/rejected": -1.2109375, "logps/chosen": -251.0, "logps/rejected": -277.0, "loss": 0.2854, "rewards/accuracies": 0.828125, "rewards/chosen": 1.1640625, "rewards/margins": 2.9921875, "rewards/rejected": -1.828125, "step": 614 }, { "epoch": 0.6803097345132744, "grad_norm": 15.870928764343262, "learning_rate": 1.234503182611066e-07, "logits/chosen": -1.38671875, "logits/rejected": -1.22265625, "logps/chosen": -271.5, "logps/rejected": -310.0, "loss": 0.3588, "rewards/accuracies": 0.75, "rewards/chosen": 0.6884765625, "rewards/margins": 2.390625, "rewards/rejected": -1.70703125, "step": 615 }, { "epoch": 0.6814159292035398, "grad_norm": 12.973315238952637, "learning_rate": 1.2267791383904017e-07, "logits/chosen": -1.23828125, "logits/rejected": -1.12890625, "logps/chosen": -229.0, "logps/rejected": -272.5, "loss": 0.2962, "rewards/accuracies": 0.859375, "rewards/chosen": 1.130859375, "rewards/margins": 3.2109375, "rewards/rejected": -2.09375, "step": 616 }, { "epoch": 0.6825221238938053, "grad_norm": 15.841652870178223, "learning_rate": 1.2190714696789407e-07, "logits/chosen": -1.20703125, "logits/rejected": -1.11328125, "logps/chosen": -266.0, "logps/rejected": -282.0, "loss": 0.3952, "rewards/accuracies": 0.6953125, "rewards/chosen": 0.6005859375, "rewards/margins": 2.2421875, "rewards/rejected": -1.640625, "step": 617 }, { "epoch": 0.6836283185840708, "grad_norm": 14.495512008666992, "learning_rate": 1.2113802756087396e-07, "logits/chosen": -1.2421875, "logits/rejected": -1.15625, "logps/chosen": -251.5, "logps/rejected": -270.5, "loss": 0.3808, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5576171875, "rewards/margins": 2.3359375, "rewards/rejected": -1.77734375, "step": 618 }, { "epoch": 0.6847345132743363, "grad_norm": 13.138040542602539, "learning_rate": 1.2037056550999623e-07, "logits/chosen": -1.08984375, "logits/rejected": -1.046875, "logps/chosen": -261.0, "logps/rejected": -308.0, "loss": 0.3147, "rewards/accuracies": 0.796875, "rewards/chosen": 0.82421875, "rewards/margins": 2.8203125, "rewards/rejected": -1.99609375, "step": 619 }, { "epoch": 0.6858407079646017, "grad_norm": 15.598456382751465, "learning_rate": 1.1960477068596154e-07, "logits/chosen": -1.36328125, "logits/rejected": -1.08984375, "logps/chosen": -266.0, "logps/rejected": -286.0, "loss": 0.3759, "rewards/accuracies": 0.8125, "rewards/chosen": 0.62890625, "rewards/margins": 2.1875, "rewards/rejected": -1.55859375, "step": 620 }, { "epoch": 0.6869469026548672, "grad_norm": 13.848457336425781, "learning_rate": 1.1884065293802756e-07, "logits/chosen": -1.1640625, "logits/rejected": -1.18359375, "logps/chosen": -244.5, "logps/rejected": -257.0, "loss": 0.3068, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.958984375, "rewards/margins": 2.8828125, "rewards/rejected": -1.92578125, "step": 621 }, { "epoch": 0.6880530973451328, "grad_norm": 12.871940612792969, "learning_rate": 1.1807822209388196e-07, "logits/chosen": -1.2890625, "logits/rejected": -1.2109375, "logps/chosen": -239.0, "logps/rejected": -281.0, "loss": 0.2818, "rewards/accuracies": 0.7890625, "rewards/chosen": 1.08984375, "rewards/margins": 3.171875, "rewards/rejected": -2.0859375, "step": 622 }, { "epoch": 0.6891592920353983, "grad_norm": 13.695356369018555, "learning_rate": 1.173174879595166e-07, "logits/chosen": -1.32421875, "logits/rejected": -1.1484375, "logps/chosen": -244.5, "logps/rejected": -276.0, "loss": 0.3137, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.9609375, "rewards/margins": 2.875, "rewards/rejected": -1.91796875, "step": 623 }, { "epoch": 0.6902654867256637, "grad_norm": 16.23243522644043, "learning_rate": 1.1655846031910119e-07, "logits/chosen": -1.359375, "logits/rejected": -1.21484375, "logps/chosen": -253.0, "logps/rejected": -301.0, "loss": 0.3016, "rewards/accuracies": 0.8125, "rewards/chosen": 0.837890625, "rewards/margins": 3.0625, "rewards/rejected": -2.2265625, "step": 624 }, { "epoch": 0.6913716814159292, "grad_norm": 14.047713279724121, "learning_rate": 1.1580114893485712e-07, "logits/chosen": -1.203125, "logits/rejected": -1.18359375, "logps/chosen": -241.0, "logps/rejected": -286.0, "loss": 0.2963, "rewards/accuracies": 0.8203125, "rewards/chosen": 1.08203125, "rewards/margins": 3.265625, "rewards/rejected": -2.1796875, "step": 625 }, { "epoch": 0.6924778761061947, "grad_norm": 13.80639934539795, "learning_rate": 1.1504556354693226e-07, "logits/chosen": -1.3359375, "logits/rejected": -1.15625, "logps/chosen": -248.5, "logps/rejected": -274.0, "loss": 0.317, "rewards/accuracies": 0.796875, "rewards/chosen": 0.9296875, "rewards/margins": 2.84375, "rewards/rejected": -1.91796875, "step": 626 }, { "epoch": 0.6935840707964602, "grad_norm": 13.272629737854004, "learning_rate": 1.1429171387327585e-07, "logits/chosen": -1.34375, "logits/rejected": -1.2109375, "logps/chosen": -238.5, "logps/rejected": -286.0, "loss": 0.2575, "rewards/accuracies": 0.875, "rewards/chosen": 1.07421875, "rewards/margins": 3.328125, "rewards/rejected": -2.25, "step": 627 }, { "epoch": 0.6946902654867256, "grad_norm": 15.396360397338867, "learning_rate": 1.1353960960951293e-07, "logits/chosen": -1.2421875, "logits/rejected": -1.15625, "logps/chosen": -276.0, "logps/rejected": -276.0, "loss": 0.3754, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.822265625, "rewards/margins": 2.4921875, "rewards/rejected": -1.671875, "step": 628 }, { "epoch": 0.6957964601769911, "grad_norm": 13.207889556884766, "learning_rate": 1.1278926042882026e-07, "logits/chosen": -1.29296875, "logits/rejected": -1.1484375, "logps/chosen": -249.5, "logps/rejected": -302.0, "loss": 0.3109, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.80078125, "rewards/margins": 2.9140625, "rewards/rejected": -2.11328125, "step": 629 }, { "epoch": 0.6969026548672567, "grad_norm": 13.04702091217041, "learning_rate": 1.120406759818014e-07, "logits/chosen": -1.29296875, "logits/rejected": -1.1640625, "logps/chosen": -236.5, "logps/rejected": -270.0, "loss": 0.3229, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.900390625, "rewards/margins": 2.6640625, "rewards/rejected": -1.76953125, "step": 630 }, { "epoch": 0.6980088495575221, "grad_norm": 14.894906997680664, "learning_rate": 1.1129386589636292e-07, "logits/chosen": -1.3125, "logits/rejected": -1.18359375, "logps/chosen": -280.0, "logps/rejected": -280.5, "loss": 0.316, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.83203125, "rewards/margins": 2.828125, "rewards/rejected": -2.0, "step": 631 }, { "epoch": 0.6991150442477876, "grad_norm": 16.062137603759766, "learning_rate": 1.1054883977759066e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.140625, "logps/chosen": -275.0, "logps/rejected": -277.0, "loss": 0.3502, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.69140625, "rewards/margins": 2.71875, "rewards/rejected": -2.0234375, "step": 632 }, { "epoch": 0.7002212389380531, "grad_norm": 14.050618171691895, "learning_rate": 1.0980560720762555e-07, "logits/chosen": -1.19921875, "logits/rejected": -1.1484375, "logps/chosen": -248.0, "logps/rejected": -288.0, "loss": 0.3215, "rewards/accuracies": 0.796875, "rewards/chosen": 0.7216796875, "rewards/margins": 2.8125, "rewards/rejected": -2.0859375, "step": 633 }, { "epoch": 0.7013274336283186, "grad_norm": 11.265563011169434, "learning_rate": 1.0906417774554132e-07, "logits/chosen": -1.1953125, "logits/rejected": -1.21484375, "logps/chosen": -234.0, "logps/rejected": -249.5, "loss": 0.2667, "rewards/accuracies": 0.8671875, "rewards/chosen": 1.05859375, "rewards/margins": 3.203125, "rewards/rejected": -2.140625, "step": 634 }, { "epoch": 0.702433628318584, "grad_norm": 13.785270690917969, "learning_rate": 1.0832456092722062e-07, "logits/chosen": -1.3046875, "logits/rejected": -1.18359375, "logps/chosen": -268.0, "logps/rejected": -271.0, "loss": 0.3269, "rewards/accuracies": 0.796875, "rewards/chosen": 0.63671875, "rewards/margins": 2.421875, "rewards/rejected": -1.7890625, "step": 635 }, { "epoch": 0.7035398230088495, "grad_norm": 14.249685287475586, "learning_rate": 1.0758676626523311e-07, "logits/chosen": -1.32421875, "logits/rejected": -1.1796875, "logps/chosen": -265.0, "logps/rejected": -286.0, "loss": 0.314, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.705078125, "rewards/margins": 2.75, "rewards/rejected": -2.046875, "step": 636 }, { "epoch": 0.7046460176991151, "grad_norm": 12.366557121276855, "learning_rate": 1.0685080324871278e-07, "logits/chosen": -1.2421875, "logits/rejected": -1.021484375, "logps/chosen": -256.0, "logps/rejected": -298.0, "loss": 0.27, "rewards/accuracies": 0.859375, "rewards/chosen": 0.75390625, "rewards/margins": 2.65625, "rewards/rejected": -1.90234375, "step": 637 }, { "epoch": 0.7057522123893806, "grad_norm": 16.30191421508789, "learning_rate": 1.0611668134323575e-07, "logits/chosen": -1.30078125, "logits/rejected": -1.1484375, "logps/chosen": -282.0, "logps/rejected": -299.0, "loss": 0.3438, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4140625, "rewards/margins": 2.54296875, "rewards/rejected": -2.1328125, "step": 638 }, { "epoch": 0.706858407079646, "grad_norm": 14.99670696258545, "learning_rate": 1.0538440999069895e-07, "logits/chosen": -1.30859375, "logits/rejected": -1.19921875, "logps/chosen": -255.5, "logps/rejected": -298.0, "loss": 0.3104, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.896484375, "rewards/margins": 2.8671875, "rewards/rejected": -1.96875, "step": 639 }, { "epoch": 0.7079646017699115, "grad_norm": 12.429228782653809, "learning_rate": 1.0465399860919838e-07, "logits/chosen": -1.2265625, "logits/rejected": -1.1953125, "logps/chosen": -255.5, "logps/rejected": -273.0, "loss": 0.2869, "rewards/accuracies": 0.828125, "rewards/chosen": 1.080078125, "rewards/margins": 3.0859375, "rewards/rejected": -2.00390625, "step": 640 }, { "epoch": 0.709070796460177, "grad_norm": 12.204998970031738, "learning_rate": 1.0392545659290788e-07, "logits/chosen": -1.2578125, "logits/rejected": -1.21875, "logps/chosen": -260.5, "logps/rejected": -274.0, "loss": 0.2817, "rewards/accuracies": 0.8671875, "rewards/chosen": 0.810546875, "rewards/margins": 2.9921875, "rewards/rejected": -2.1875, "step": 641 }, { "epoch": 0.7101769911504425, "grad_norm": 14.068879127502441, "learning_rate": 1.0319879331195882e-07, "logits/chosen": -1.21484375, "logits/rejected": -1.0703125, "logps/chosen": -254.0, "logps/rejected": -272.0, "loss": 0.3538, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.591796875, "rewards/margins": 2.5859375, "rewards/rejected": -2.0, "step": 642 }, { "epoch": 0.7112831858407079, "grad_norm": 12.932374954223633, "learning_rate": 1.0247401811231887e-07, "logits/chosen": -1.390625, "logits/rejected": -1.18359375, "logps/chosen": -233.0, "logps/rejected": -259.0, "loss": 0.2886, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.841796875, "rewards/margins": 2.9375, "rewards/rejected": -2.09375, "step": 643 }, { "epoch": 0.7123893805309734, "grad_norm": 12.754419326782227, "learning_rate": 1.0175114031567245e-07, "logits/chosen": -1.27734375, "logits/rejected": -1.2109375, "logps/chosen": -253.0, "logps/rejected": -288.0, "loss": 0.2941, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.798828125, "rewards/margins": 2.671875, "rewards/rejected": -1.875, "step": 644 }, { "epoch": 0.713495575221239, "grad_norm": 13.075281143188477, "learning_rate": 1.0103016921930055e-07, "logits/chosen": -1.3046875, "logits/rejected": -1.18359375, "logps/chosen": -247.5, "logps/rejected": -276.0, "loss": 0.324, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.845703125, "rewards/margins": 2.5546875, "rewards/rejected": -1.70703125, "step": 645 }, { "epoch": 0.7146017699115044, "grad_norm": 15.02340030670166, "learning_rate": 1.0031111409596091e-07, "logits/chosen": -1.15625, "logits/rejected": -1.2109375, "logps/chosen": -246.5, "logps/rejected": -258.0, "loss": 0.3851, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.669921875, "rewards/margins": 2.4140625, "rewards/rejected": -1.7421875, "step": 646 }, { "epoch": 0.7157079646017699, "grad_norm": 12.193872451782227, "learning_rate": 9.95939841937693e-08, "logits/chosen": -1.30859375, "logits/rejected": -1.2578125, "logps/chosen": -259.5, "logps/rejected": -265.5, "loss": 0.2392, "rewards/accuracies": 0.890625, "rewards/chosen": 0.86328125, "rewards/margins": 3.2734375, "rewards/rejected": -2.4140625, "step": 647 }, { "epoch": 0.7168141592920354, "grad_norm": 13.42468547821045, "learning_rate": 9.887878873608027e-08, "logits/chosen": -1.16015625, "logits/rejected": -1.11328125, "logps/chosen": -263.5, "logps/rejected": -290.0, "loss": 0.3087, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.791015625, "rewards/margins": 2.875, "rewards/rejected": -2.08203125, "step": 648 }, { "epoch": 0.7179203539823009, "grad_norm": 13.621614456176758, "learning_rate": 9.816553692136834e-08, "logits/chosen": -1.17578125, "logits/rejected": -1.109375, "logps/chosen": -256.5, "logps/rejected": -282.0, "loss": 0.2806, "rewards/accuracies": 0.84375, "rewards/chosen": 1.1484375, "rewards/margins": 3.046875, "rewards/rejected": -1.89453125, "step": 649 }, { "epoch": 0.7190265486725663, "grad_norm": 13.231938362121582, "learning_rate": 9.745423792310995e-08, "logits/chosen": -1.29296875, "logits/rejected": -1.12890625, "logps/chosen": -243.0, "logps/rejected": -258.5, "loss": 0.2872, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.7265625, "rewards/margins": 2.90625, "rewards/rejected": -2.1796875, "step": 650 }, { "epoch": 0.7190265486725663, "eval_logits/chosen": -1.2634872198104858, "eval_logits/rejected": -1.1566191911697388, "eval_logps/chosen": -252.76119995117188, "eval_logps/rejected": -279.27362060546875, "eval_loss": 0.3217768967151642, "eval_rewards/accuracies": 0.8134269714355469, "eval_rewards/chosen": 0.8057758212089539, "eval_rewards/margins": 2.8031716346740723, "eval_rewards/rejected": -1.996579647064209, "eval_runtime": 193.0564, "eval_samples_per_second": 66.576, "eval_steps_per_second": 1.041, "step": 650 }, { "epoch": 0.7201327433628318, "grad_norm": 13.893576622009277, "learning_rate": 9.674490088966562e-08, "logits/chosen": -1.2734375, "logits/rejected": -1.12109375, "logps/chosen": -262.5, "logps/rejected": -296.0, "loss": 0.2924, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.919921875, "rewards/margins": 3.0859375, "rewards/rejected": -2.1640625, "step": 651 }, { "epoch": 0.7212389380530974, "grad_norm": 13.017692565917969, "learning_rate": 9.603753494416184e-08, "logits/chosen": -1.328125, "logits/rejected": -1.28515625, "logps/chosen": -242.0, "logps/rejected": -248.0, "loss": 0.2897, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.791015625, "rewards/margins": 2.7109375, "rewards/rejected": -1.9140625, "step": 652 }, { "epoch": 0.7223451327433629, "grad_norm": 17.18537712097168, "learning_rate": 9.533214918437421e-08, "logits/chosen": -1.25390625, "logits/rejected": -1.23828125, "logps/chosen": -283.0, "logps/rejected": -287.0, "loss": 0.402, "rewards/accuracies": 0.796875, "rewards/chosen": 0.509765625, "rewards/margins": 2.1640625, "rewards/rejected": -1.65625, "step": 653 }, { "epoch": 0.7234513274336283, "grad_norm": 17.223974227905273, "learning_rate": 9.462875268261e-08, "logits/chosen": -1.19140625, "logits/rejected": -1.2578125, "logps/chosen": -297.0, "logps/rejected": -311.0, "loss": 0.3244, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.814453125, "rewards/margins": 2.859375, "rewards/rejected": -2.046875, "step": 654 }, { "epoch": 0.7245575221238938, "grad_norm": 14.414237976074219, "learning_rate": 9.39273544855918e-08, "logits/chosen": -1.24609375, "logits/rejected": -1.10546875, "logps/chosen": -259.0, "logps/rejected": -303.0, "loss": 0.3058, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.888671875, "rewards/margins": 3.140625, "rewards/rejected": -2.25, "step": 655 }, { "epoch": 0.7256637168141593, "grad_norm": 13.708487510681152, "learning_rate": 9.32279636143411e-08, "logits/chosen": -1.359375, "logits/rejected": -1.16015625, "logps/chosen": -271.0, "logps/rejected": -277.5, "loss": 0.3109, "rewards/accuracies": 0.828125, "rewards/chosen": 0.89453125, "rewards/margins": 2.640625, "rewards/rejected": -1.7421875, "step": 656 }, { "epoch": 0.7267699115044248, "grad_norm": 14.687643051147461, "learning_rate": 9.253058906406194e-08, "logits/chosen": -1.203125, "logits/rejected": -1.0625, "logps/chosen": -280.0, "logps/rejected": -306.0, "loss": 0.314, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.810546875, "rewards/margins": 2.7734375, "rewards/rejected": -1.96875, "step": 657 }, { "epoch": 0.7278761061946902, "grad_norm": 13.893321990966797, "learning_rate": 9.183523980402582e-08, "logits/chosen": -1.21484375, "logits/rejected": -1.203125, "logps/chosen": -240.0, "logps/rejected": -291.0, "loss": 0.3241, "rewards/accuracies": 0.828125, "rewards/chosen": 0.82421875, "rewards/margins": 2.6875, "rewards/rejected": -1.87109375, "step": 658 }, { "epoch": 0.7289823008849557, "grad_norm": 13.528450965881348, "learning_rate": 9.114192477745566e-08, "logits/chosen": -1.3359375, "logits/rejected": -1.1484375, "logps/chosen": -262.0, "logps/rejected": -278.0, "loss": 0.3098, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.6640625, "rewards/margins": 2.6953125, "rewards/rejected": -2.03515625, "step": 659 }, { "epoch": 0.7300884955752213, "grad_norm": 15.182424545288086, "learning_rate": 9.045065290141138e-08, "logits/chosen": -1.22265625, "logits/rejected": -1.1015625, "logps/chosen": -275.0, "logps/rejected": -305.0, "loss": 0.3081, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.822265625, "rewards/margins": 2.8046875, "rewards/rejected": -1.984375, "step": 660 }, { "epoch": 0.7311946902654868, "grad_norm": 14.025420188903809, "learning_rate": 8.976143306667491e-08, "logits/chosen": -1.21484375, "logits/rejected": -1.16015625, "logps/chosen": -255.5, "logps/rejected": -290.0, "loss": 0.2861, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.857421875, "rewards/margins": 2.9765625, "rewards/rejected": -2.1171875, "step": 661 }, { "epoch": 0.7323008849557522, "grad_norm": 12.591769218444824, "learning_rate": 8.907427413763572e-08, "logits/chosen": -1.32421875, "logits/rejected": -1.1171875, "logps/chosen": -268.0, "logps/rejected": -275.5, "loss": 0.2648, "rewards/accuracies": 0.8828125, "rewards/chosen": 0.677734375, "rewards/margins": 2.9453125, "rewards/rejected": -2.265625, "step": 662 }, { "epoch": 0.7334070796460177, "grad_norm": 15.431063652038574, "learning_rate": 8.838918495217712e-08, "logits/chosen": -1.25390625, "logits/rejected": -1.14453125, "logps/chosen": -269.5, "logps/rejected": -304.0, "loss": 0.3575, "rewards/accuracies": 0.7578125, "rewards/chosen": 0.787109375, "rewards/margins": 2.640625, "rewards/rejected": -1.84765625, "step": 663 }, { "epoch": 0.7345132743362832, "grad_norm": 14.970857620239258, "learning_rate": 8.770617432156257e-08, "logits/chosen": -1.28515625, "logits/rejected": -1.05859375, "logps/chosen": -268.0, "logps/rejected": -298.0, "loss": 0.3506, "rewards/accuracies": 0.796875, "rewards/chosen": 0.708984375, "rewards/margins": 2.4921875, "rewards/rejected": -1.78125, "step": 664 }, { "epoch": 0.7356194690265486, "grad_norm": 15.439310073852539, "learning_rate": 8.702525103032184e-08, "logits/chosen": -1.2421875, "logits/rejected": -1.07421875, "logps/chosen": -248.0, "logps/rejected": -280.5, "loss": 0.3629, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.7626953125, "rewards/margins": 2.59375, "rewards/rejected": -1.828125, "step": 665 }, { "epoch": 0.7367256637168141, "grad_norm": 13.227315902709961, "learning_rate": 8.634642383613891e-08, "logits/chosen": -1.21875, "logits/rejected": -1.11328125, "logps/chosen": -254.5, "logps/rejected": -285.0, "loss": 0.3095, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.8984375, "rewards/margins": 2.875, "rewards/rejected": -1.97265625, "step": 666 }, { "epoch": 0.7378318584070797, "grad_norm": 12.241044044494629, "learning_rate": 8.566970146973835e-08, "logits/chosen": -1.30859375, "logits/rejected": -1.125, "logps/chosen": -257.0, "logps/rejected": -293.0, "loss": 0.2911, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.849609375, "rewards/margins": 3.0, "rewards/rejected": -2.15234375, "step": 667 }, { "epoch": 0.7389380530973452, "grad_norm": 12.409917831420898, "learning_rate": 8.499509263477387e-08, "logits/chosen": -1.375, "logits/rejected": -1.15234375, "logps/chosen": -222.0, "logps/rejected": -269.0, "loss": 0.285, "rewards/accuracies": 0.84375, "rewards/chosen": 1.044921875, "rewards/margins": 3.109375, "rewards/rejected": -2.05859375, "step": 668 }, { "epoch": 0.7400442477876106, "grad_norm": 16.232877731323242, "learning_rate": 8.432260600771599e-08, "logits/chosen": -1.31640625, "logits/rejected": -1.171875, "logps/chosen": -278.0, "logps/rejected": -274.0, "loss": 0.3434, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6806640625, "rewards/margins": 2.796875, "rewards/rejected": -2.11328125, "step": 669 }, { "epoch": 0.7411504424778761, "grad_norm": 12.330305099487305, "learning_rate": 8.36522502377403e-08, "logits/chosen": -1.34375, "logits/rejected": -1.1640625, "logps/chosen": -239.5, "logps/rejected": -292.0, "loss": 0.2725, "rewards/accuracies": 0.84375, "rewards/chosen": 0.86328125, "rewards/margins": 2.7734375, "rewards/rejected": -1.9140625, "step": 670 }, { "epoch": 0.7422566371681416, "grad_norm": 15.042512893676758, "learning_rate": 8.298403394661657e-08, "logits/chosen": -1.24609375, "logits/rejected": -1.18359375, "logps/chosen": -278.0, "logps/rejected": -262.0, "loss": 0.3643, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.53515625, "rewards/margins": 2.4609375, "rewards/rejected": -1.92578125, "step": 671 }, { "epoch": 0.7433628318584071, "grad_norm": 15.917474746704102, "learning_rate": 8.231796572859778e-08, "logits/chosen": -1.09765625, "logits/rejected": -1.12109375, "logps/chosen": -250.5, "logps/rejected": -301.0, "loss": 0.2963, "rewards/accuracies": 0.8203125, "rewards/chosen": 1.0, "rewards/margins": 3.1796875, "rewards/rejected": -2.1796875, "step": 672 }, { "epoch": 0.7444690265486725, "grad_norm": 13.662030220031738, "learning_rate": 8.165405415030915e-08, "logits/chosen": -1.35546875, "logits/rejected": -1.14453125, "logps/chosen": -288.0, "logps/rejected": -279.0, "loss": 0.2763, "rewards/accuracies": 0.859375, "rewards/chosen": 0.91015625, "rewards/margins": 2.953125, "rewards/rejected": -2.0390625, "step": 673 }, { "epoch": 0.745575221238938, "grad_norm": 14.487608909606934, "learning_rate": 8.099230775063879e-08, "logits/chosen": -1.2890625, "logits/rejected": -1.140625, "logps/chosen": -261.0, "logps/rejected": -277.0, "loss": 0.319, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.75390625, "rewards/margins": 2.7109375, "rewards/rejected": -1.953125, "step": 674 }, { "epoch": 0.7466814159292036, "grad_norm": 15.394200325012207, "learning_rate": 8.033273504062698e-08, "logits/chosen": -1.12109375, "logits/rejected": -1.11328125, "logps/chosen": -267.0, "logps/rejected": -314.0, "loss": 0.3292, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.70703125, "rewards/margins": 2.84375, "rewards/rejected": -2.1328125, "step": 675 }, { "epoch": 0.7477876106194691, "grad_norm": 16.063007354736328, "learning_rate": 7.967534450335728e-08, "logits/chosen": -1.32421875, "logits/rejected": -1.19140625, "logps/chosen": -253.5, "logps/rejected": -266.5, "loss": 0.3824, "rewards/accuracies": 0.78125, "rewards/chosen": 0.556640625, "rewards/margins": 2.4453125, "rewards/rejected": -1.88671875, "step": 676 }, { "epoch": 0.7488938053097345, "grad_norm": 15.266008377075195, "learning_rate": 7.902014459384742e-08, "logits/chosen": -1.21875, "logits/rejected": -1.03515625, "logps/chosen": -259.0, "logps/rejected": -301.0, "loss": 0.3159, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.875, "rewards/margins": 3.1015625, "rewards/rejected": -2.2265625, "step": 677 }, { "epoch": 0.75, "grad_norm": 11.596745491027832, "learning_rate": 7.836714373894015e-08, "logits/chosen": -1.1484375, "logits/rejected": -1.05078125, "logps/chosen": -248.5, "logps/rejected": -268.5, "loss": 0.224, "rewards/accuracies": 0.8828125, "rewards/chosen": 1.046875, "rewards/margins": 3.390625, "rewards/rejected": -2.3359375, "step": 678 }, { "epoch": 0.7511061946902655, "grad_norm": 12.86449909210205, "learning_rate": 7.771635033719528e-08, "logits/chosen": -1.26171875, "logits/rejected": -1.09375, "logps/chosen": -271.0, "logps/rejected": -258.5, "loss": 0.2782, "rewards/accuracies": 0.875, "rewards/chosen": 0.80078125, "rewards/margins": 2.8359375, "rewards/rejected": -2.03125, "step": 679 }, { "epoch": 0.7522123893805309, "grad_norm": 12.727359771728516, "learning_rate": 7.70677727587816e-08, "logits/chosen": -1.2578125, "logits/rejected": -1.0859375, "logps/chosen": -257.0, "logps/rejected": -295.0, "loss": 0.2793, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.70703125, "rewards/margins": 2.734375, "rewards/rejected": -2.0234375, "step": 680 }, { "epoch": 0.7533185840707964, "grad_norm": 12.862136840820312, "learning_rate": 7.642141934536874e-08, "logits/chosen": -1.3203125, "logits/rejected": -1.1953125, "logps/chosen": -242.5, "logps/rejected": -268.0, "loss": 0.2937, "rewards/accuracies": 0.859375, "rewards/chosen": 0.720703125, "rewards/margins": 3.0546875, "rewards/rejected": -2.328125, "step": 681 }, { "epoch": 0.754424778761062, "grad_norm": 13.950096130371094, "learning_rate": 7.577729841002075e-08, "logits/chosen": -1.17578125, "logits/rejected": -1.12890625, "logps/chosen": -279.0, "logps/rejected": -308.0, "loss": 0.2855, "rewards/accuracies": 0.859375, "rewards/chosen": 0.703125, "rewards/margins": 2.875, "rewards/rejected": -2.171875, "step": 682 }, { "epoch": 0.7555309734513275, "grad_norm": 15.024590492248535, "learning_rate": 7.513541823708827e-08, "logits/chosen": -1.26953125, "logits/rejected": -1.140625, "logps/chosen": -251.0, "logps/rejected": -292.0, "loss": 0.3303, "rewards/accuracies": 0.796875, "rewards/chosen": 0.697265625, "rewards/margins": 2.75, "rewards/rejected": -2.05078125, "step": 683 }, { "epoch": 0.7566371681415929, "grad_norm": 14.337872505187988, "learning_rate": 7.449578708210267e-08, "logits/chosen": -1.26953125, "logits/rejected": -1.21484375, "logps/chosen": -283.0, "logps/rejected": -276.0, "loss": 0.3292, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.71484375, "rewards/margins": 2.8515625, "rewards/rejected": -2.14453125, "step": 684 }, { "epoch": 0.7577433628318584, "grad_norm": 13.712812423706055, "learning_rate": 7.385841317166966e-08, "logits/chosen": -1.30078125, "logits/rejected": -1.1953125, "logps/chosen": -250.5, "logps/rejected": -283.0, "loss": 0.309, "rewards/accuracies": 0.828125, "rewards/chosen": 0.654296875, "rewards/margins": 2.8515625, "rewards/rejected": -2.1953125, "step": 685 }, { "epoch": 0.7588495575221239, "grad_norm": 11.522303581237793, "learning_rate": 7.322330470336313e-08, "logits/chosen": -1.43359375, "logits/rejected": -1.15625, "logps/chosen": -247.0, "logps/rejected": -287.0, "loss": 0.2535, "rewards/accuracies": 0.890625, "rewards/chosen": 0.541015625, "rewards/margins": 2.796875, "rewards/rejected": -2.265625, "step": 686 }, { "epoch": 0.7599557522123894, "grad_norm": 16.617996215820312, "learning_rate": 7.25904698456203e-08, "logits/chosen": -1.21484375, "logits/rejected": -1.0859375, "logps/chosen": -283.0, "logps/rejected": -304.0, "loss": 0.3768, "rewards/accuracies": 0.765625, "rewards/chosen": 0.59375, "rewards/margins": 2.578125, "rewards/rejected": -1.98828125, "step": 687 }, { "epoch": 0.7610619469026548, "grad_norm": 13.16273021697998, "learning_rate": 7.195991673763644e-08, "logits/chosen": -1.2578125, "logits/rejected": -1.140625, "logps/chosen": -256.0, "logps/rejected": -261.5, "loss": 0.342, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.5322265625, "rewards/margins": 2.6015625, "rewards/rejected": -2.078125, "step": 688 }, { "epoch": 0.7621681415929203, "grad_norm": 14.337390899658203, "learning_rate": 7.133165348925976e-08, "logits/chosen": -1.25, "logits/rejected": -1.2734375, "logps/chosen": -253.5, "logps/rejected": -281.0, "loss": 0.3474, "rewards/accuracies": 0.7578125, "rewards/chosen": 1.0078125, "rewards/margins": 2.8359375, "rewards/rejected": -1.83203125, "step": 689 }, { "epoch": 0.7632743362831859, "grad_norm": 13.05460262298584, "learning_rate": 7.070568818088782e-08, "logits/chosen": -1.29296875, "logits/rejected": -1.06640625, "logps/chosen": -266.5, "logps/rejected": -289.5, "loss": 0.3306, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.76171875, "rewards/margins": 2.6875, "rewards/rejected": -1.921875, "step": 690 }, { "epoch": 0.7643805309734514, "grad_norm": 13.12061595916748, "learning_rate": 7.008202886336323e-08, "logits/chosen": -1.296875, "logits/rejected": -1.11328125, "logps/chosen": -252.0, "logps/rejected": -294.0, "loss": 0.3064, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6943359375, "rewards/margins": 2.96875, "rewards/rejected": -2.2734375, "step": 691 }, { "epoch": 0.7654867256637168, "grad_norm": 15.881913185119629, "learning_rate": 6.94606835578699e-08, "logits/chosen": -1.2734375, "logits/rejected": -1.09375, "logps/chosen": -267.5, "logps/rejected": -279.0, "loss": 0.39, "rewards/accuracies": 0.796875, "rewards/chosen": 0.5244140625, "rewards/margins": 2.265625, "rewards/rejected": -1.734375, "step": 692 }, { "epoch": 0.7665929203539823, "grad_norm": 18.117141723632812, "learning_rate": 6.884166025583043e-08, "logits/chosen": -1.19140625, "logits/rejected": -1.13671875, "logps/chosen": -289.0, "logps/rejected": -318.0, "loss": 0.3893, "rewards/accuracies": 0.734375, "rewards/chosen": 0.591796875, "rewards/margins": 2.71875, "rewards/rejected": -2.1328125, "step": 693 }, { "epoch": 0.7676991150442478, "grad_norm": 14.054704666137695, "learning_rate": 6.822496691880275e-08, "logits/chosen": -1.34765625, "logits/rejected": -1.18359375, "logps/chosen": -250.5, "logps/rejected": -272.0, "loss": 0.3268, "rewards/accuracies": 0.796875, "rewards/chosen": 0.708984375, "rewards/margins": 2.6875, "rewards/rejected": -1.9765625, "step": 694 }, { "epoch": 0.7688053097345132, "grad_norm": 11.454045295715332, "learning_rate": 6.761061147837807e-08, "logits/chosen": -1.41796875, "logits/rejected": -1.12890625, "logps/chosen": -254.5, "logps/rejected": -293.0, "loss": 0.2523, "rewards/accuracies": 0.84375, "rewards/chosen": 0.861328125, "rewards/margins": 3.1484375, "rewards/rejected": -2.28125, "step": 695 }, { "epoch": 0.7699115044247787, "grad_norm": 13.245506286621094, "learning_rate": 6.699860183607894e-08, "logits/chosen": -1.359375, "logits/rejected": -1.140625, "logps/chosen": -275.0, "logps/rejected": -273.0, "loss": 0.3098, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4716796875, "rewards/margins": 2.71875, "rewards/rejected": -2.25, "step": 696 }, { "epoch": 0.7710176991150443, "grad_norm": 13.23479175567627, "learning_rate": 6.638894586325719e-08, "logits/chosen": -1.2890625, "logits/rejected": -1.19140625, "logps/chosen": -246.0, "logps/rejected": -288.0, "loss": 0.2909, "rewards/accuracies": 0.8671875, "rewards/chosen": 0.5712890625, "rewards/margins": 2.859375, "rewards/rejected": -2.296875, "step": 697 }, { "epoch": 0.7721238938053098, "grad_norm": 14.469060897827148, "learning_rate": 6.578165140099317e-08, "logits/chosen": -1.26953125, "logits/rejected": -1.20703125, "logps/chosen": -252.5, "logps/rejected": -277.0, "loss": 0.3493, "rewards/accuracies": 0.796875, "rewards/chosen": 0.79296875, "rewards/margins": 2.8359375, "rewards/rejected": -2.04296875, "step": 698 }, { "epoch": 0.7732300884955752, "grad_norm": 15.471959114074707, "learning_rate": 6.517672625999465e-08, "logits/chosen": -1.20703125, "logits/rejected": -1.09375, "logps/chosen": -254.0, "logps/rejected": -284.0, "loss": 0.3456, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.5849609375, "rewards/margins": 2.78125, "rewards/rejected": -2.203125, "step": 699 }, { "epoch": 0.7743362831858407, "grad_norm": 13.424947738647461, "learning_rate": 6.457417822049627e-08, "logits/chosen": -1.35546875, "logits/rejected": -1.13671875, "logps/chosen": -260.0, "logps/rejected": -280.0, "loss": 0.3278, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.681640625, "rewards/margins": 2.5703125, "rewards/rejected": -1.890625, "step": 700 }, { "epoch": 0.7743362831858407, "eval_logits/chosen": -1.267957091331482, "eval_logits/rejected": -1.1595537662506104, "eval_logps/chosen": -253.52735900878906, "eval_logps/rejected": -279.9950256347656, "eval_loss": 0.3205508887767792, "eval_rewards/accuracies": 0.8137379288673401, "eval_rewards/chosen": 0.73013836145401, "eval_rewards/margins": 2.814093589782715, "eval_rewards/rejected": -2.0833332538604736, "eval_runtime": 193.1227, "eval_samples_per_second": 66.554, "eval_steps_per_second": 1.041, "step": 700 }, { "epoch": 0.7754424778761062, "grad_norm": 14.452095985412598, "learning_rate": 6.397401503215991e-08, "logits/chosen": -1.2578125, "logits/rejected": -1.125, "logps/chosen": -268.0, "logps/rejected": -296.0, "loss": 0.3012, "rewards/accuracies": 0.796875, "rewards/chosen": 0.92578125, "rewards/margins": 3.0703125, "rewards/rejected": -2.14453125, "step": 701 }, { "epoch": 0.7765486725663717, "grad_norm": 13.497214317321777, "learning_rate": 6.33762444139744e-08, "logits/chosen": -1.53515625, "logits/rejected": -1.15625, "logps/chosen": -244.5, "logps/rejected": -292.0, "loss": 0.3147, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.78515625, "rewards/margins": 2.78125, "rewards/rejected": -2.0, "step": 702 }, { "epoch": 0.7776548672566371, "grad_norm": 13.972166061401367, "learning_rate": 6.278087405415683e-08, "logits/chosen": -1.3125, "logits/rejected": -1.140625, "logps/chosen": -258.0, "logps/rejected": -260.0, "loss": 0.2868, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.8046875, "rewards/margins": 3.03125, "rewards/rejected": -2.2265625, "step": 703 }, { "epoch": 0.7787610619469026, "grad_norm": 13.861830711364746, "learning_rate": 6.218791161005335e-08, "logits/chosen": -1.2265625, "logits/rejected": -1.09765625, "logps/chosen": -238.5, "logps/rejected": -299.0, "loss": 0.2945, "rewards/accuracies": 0.8671875, "rewards/chosen": 0.76953125, "rewards/margins": 3.0, "rewards/rejected": -2.234375, "step": 704 }, { "epoch": 0.7798672566371682, "grad_norm": 15.766735076904297, "learning_rate": 6.159736470804059e-08, "logits/chosen": -1.3046875, "logits/rejected": -1.2421875, "logps/chosen": -250.5, "logps/rejected": -261.0, "loss": 0.3834, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.6416015625, "rewards/margins": 2.21875, "rewards/rejected": -1.57421875, "step": 705 }, { "epoch": 0.7809734513274337, "grad_norm": 13.845602989196777, "learning_rate": 6.100924094342785e-08, "logits/chosen": -1.4140625, "logits/rejected": -1.19921875, "logps/chosen": -230.5, "logps/rejected": -236.0, "loss": 0.3024, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.705078125, "rewards/margins": 2.9609375, "rewards/rejected": -2.2578125, "step": 706 }, { "epoch": 0.7820796460176991, "grad_norm": 14.929936408996582, "learning_rate": 6.042354788035942e-08, "logits/chosen": -1.18359375, "logits/rejected": -1.04296875, "logps/chosen": -269.0, "logps/rejected": -291.0, "loss": 0.3403, "rewards/accuracies": 0.7578125, "rewards/chosen": 0.6162109375, "rewards/margins": 2.7109375, "rewards/rejected": -2.09765625, "step": 707 }, { "epoch": 0.7831858407079646, "grad_norm": 13.97938346862793, "learning_rate": 5.984029305171678e-08, "logits/chosen": -1.26953125, "logits/rejected": -1.2734375, "logps/chosen": -245.0, "logps/rejected": -287.0, "loss": 0.2896, "rewards/accuracies": 0.8203125, "rewards/chosen": 1.0, "rewards/margins": 3.21875, "rewards/rejected": -2.21875, "step": 708 }, { "epoch": 0.7842920353982301, "grad_norm": 14.008685111999512, "learning_rate": 5.925948395902253e-08, "logits/chosen": -1.32421875, "logits/rejected": -1.21875, "logps/chosen": -272.0, "logps/rejected": -313.0, "loss": 0.3008, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.884765625, "rewards/margins": 2.9765625, "rewards/rejected": -2.0859375, "step": 709 }, { "epoch": 0.7853982300884956, "grad_norm": 12.459348678588867, "learning_rate": 5.868112807234313e-08, "logits/chosen": -1.29296875, "logits/rejected": -1.140625, "logps/chosen": -269.0, "logps/rejected": -371.0, "loss": 0.262, "rewards/accuracies": 0.875, "rewards/chosen": 0.994140625, "rewards/margins": 3.0390625, "rewards/rejected": -2.046875, "step": 710 }, { "epoch": 0.786504424778761, "grad_norm": 14.358124732971191, "learning_rate": 5.810523283019339e-08, "logits/chosen": -1.3046875, "logits/rejected": -1.1875, "logps/chosen": -282.0, "logps/rejected": -273.0, "loss": 0.3423, "rewards/accuracies": 0.859375, "rewards/chosen": 0.484375, "rewards/margins": 2.484375, "rewards/rejected": -2.00390625, "step": 711 }, { "epoch": 0.7876106194690266, "grad_norm": 12.388589859008789, "learning_rate": 5.753180563944057e-08, "logits/chosen": -1.37109375, "logits/rejected": -1.09375, "logps/chosen": -232.0, "logps/rejected": -247.5, "loss": 0.2437, "rewards/accuracies": 0.859375, "rewards/chosen": 0.931640625, "rewards/margins": 3.46875, "rewards/rejected": -2.5390625, "step": 712 }, { "epoch": 0.7887168141592921, "grad_norm": 12.301764488220215, "learning_rate": 5.6960853875208935e-08, "logits/chosen": -1.19921875, "logits/rejected": -1.1640625, "logps/chosen": -252.0, "logps/rejected": -267.0, "loss": 0.3027, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.671875, "rewards/margins": 2.8125, "rewards/rejected": -2.1328125, "step": 713 }, { "epoch": 0.7898230088495575, "grad_norm": 14.501238822937012, "learning_rate": 5.6392384880785294e-08, "logits/chosen": -1.37109375, "logits/rejected": -1.2265625, "logps/chosen": -276.0, "logps/rejected": -285.0, "loss": 0.3198, "rewards/accuracies": 0.8125, "rewards/chosen": 0.501953125, "rewards/margins": 2.6328125, "rewards/rejected": -2.125, "step": 714 }, { "epoch": 0.790929203539823, "grad_norm": 12.956294059753418, "learning_rate": 5.5826405967524357e-08, "logits/chosen": -1.1484375, "logits/rejected": -1.0859375, "logps/chosen": -255.0, "logps/rejected": -297.0, "loss": 0.272, "rewards/accuracies": 0.859375, "rewards/chosen": 0.779296875, "rewards/margins": 3.203125, "rewards/rejected": -2.421875, "step": 715 }, { "epoch": 0.7920353982300885, "grad_norm": 14.246673583984375, "learning_rate": 5.526292441475447e-08, "logits/chosen": -1.32421875, "logits/rejected": -1.140625, "logps/chosen": -269.0, "logps/rejected": -308.0, "loss": 0.2897, "rewards/accuracies": 0.875, "rewards/chosen": 0.818359375, "rewards/margins": 2.765625, "rewards/rejected": -1.94921875, "step": 716 }, { "epoch": 0.793141592920354, "grad_norm": 14.141976356506348, "learning_rate": 5.470194746968451e-08, "logits/chosen": -1.265625, "logits/rejected": -1.2265625, "logps/chosen": -246.0, "logps/rejected": -288.0, "loss": 0.3056, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7265625, "rewards/margins": 3.0859375, "rewards/rejected": -2.359375, "step": 717 }, { "epoch": 0.7942477876106194, "grad_norm": 13.89908218383789, "learning_rate": 5.4143482347310116e-08, "logits/chosen": -1.3046875, "logits/rejected": -1.10546875, "logps/chosen": -273.0, "logps/rejected": -295.0, "loss": 0.3041, "rewards/accuracies": 0.84375, "rewards/chosen": 0.87890625, "rewards/margins": 2.828125, "rewards/rejected": -1.94921875, "step": 718 }, { "epoch": 0.7953539823008849, "grad_norm": 11.489982604980469, "learning_rate": 5.358753623032136e-08, "logits/chosen": -1.359375, "logits/rejected": -1.12109375, "logps/chosen": -228.0, "logps/rejected": -249.0, "loss": 0.2602, "rewards/accuracies": 0.859375, "rewards/chosen": 0.88671875, "rewards/margins": 3.09375, "rewards/rejected": -2.2109375, "step": 719 }, { "epoch": 0.7964601769911505, "grad_norm": 13.466360092163086, "learning_rate": 5.3034116269010194e-08, "logits/chosen": -1.453125, "logits/rejected": -1.234375, "logps/chosen": -269.0, "logps/rejected": -295.0, "loss": 0.3119, "rewards/accuracies": 0.796875, "rewards/chosen": 0.59765625, "rewards/margins": 2.4921875, "rewards/rejected": -1.890625, "step": 720 }, { "epoch": 0.797566371681416, "grad_norm": 15.516824722290039, "learning_rate": 5.248322958117815e-08, "logits/chosen": -1.1875, "logits/rejected": -1.15625, "logps/chosen": -261.0, "logps/rejected": -273.0, "loss": 0.361, "rewards/accuracies": 0.765625, "rewards/chosen": 0.91015625, "rewards/margins": 2.6875, "rewards/rejected": -1.78125, "step": 721 }, { "epoch": 0.7986725663716814, "grad_norm": 14.861969947814941, "learning_rate": 5.1934883252045507e-08, "logits/chosen": -1.234375, "logits/rejected": -1.16796875, "logps/chosen": -249.5, "logps/rejected": -279.0, "loss": 0.3549, "rewards/accuracies": 0.75, "rewards/chosen": 0.65625, "rewards/margins": 2.8046875, "rewards/rejected": -2.1484375, "step": 722 }, { "epoch": 0.7997787610619469, "grad_norm": 14.74849796295166, "learning_rate": 5.138908433415945e-08, "logits/chosen": -1.28125, "logits/rejected": -1.23828125, "logps/chosen": -271.0, "logps/rejected": -311.0, "loss": 0.2943, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.8203125, "rewards/margins": 3.1171875, "rewards/rejected": -2.3046875, "step": 723 }, { "epoch": 0.8008849557522124, "grad_norm": 13.291254043579102, "learning_rate": 5.0845839847303894e-08, "logits/chosen": -1.25390625, "logits/rejected": -1.11328125, "logps/chosen": -244.5, "logps/rejected": -257.0, "loss": 0.3242, "rewards/accuracies": 0.796875, "rewards/chosen": 0.666015625, "rewards/margins": 2.8671875, "rewards/rejected": -2.1953125, "step": 724 }, { "epoch": 0.8019911504424779, "grad_norm": 12.395694732666016, "learning_rate": 5.030515677840882e-08, "logits/chosen": -1.1875, "logits/rejected": -1.1171875, "logps/chosen": -240.5, "logps/rejected": -276.0, "loss": 0.3041, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.7890625, "rewards/margins": 3.03125, "rewards/rejected": -2.2421875, "step": 725 }, { "epoch": 0.8030973451327433, "grad_norm": 13.156864166259766, "learning_rate": 4.9767042081460626e-08, "logits/chosen": -1.421875, "logits/rejected": -1.12109375, "logps/chosen": -253.5, "logps/rejected": -286.0, "loss": 0.2806, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.701171875, "rewards/margins": 2.8359375, "rewards/rejected": -2.1328125, "step": 726 }, { "epoch": 0.8042035398230089, "grad_norm": 13.708073616027832, "learning_rate": 4.923150267741266e-08, "logits/chosen": -1.1953125, "logits/rejected": -1.13671875, "logps/chosen": -272.0, "logps/rejected": -310.0, "loss": 0.2606, "rewards/accuracies": 0.8671875, "rewards/chosen": 0.796875, "rewards/margins": 3.2734375, "rewards/rejected": -2.4765625, "step": 727 }, { "epoch": 0.8053097345132744, "grad_norm": 13.454339981079102, "learning_rate": 4.869854545409627e-08, "logits/chosen": -1.2578125, "logits/rejected": -1.2109375, "logps/chosen": -243.0, "logps/rejected": -296.0, "loss": 0.2951, "rewards/accuracies": 0.84375, "rewards/chosen": 0.841796875, "rewards/margins": 2.8671875, "rewards/rejected": -2.02734375, "step": 728 }, { "epoch": 0.8064159292035398, "grad_norm": 13.385002136230469, "learning_rate": 4.816817726613187e-08, "logits/chosen": -1.30078125, "logits/rejected": -1.20703125, "logps/chosen": -255.5, "logps/rejected": -267.0, "loss": 0.3009, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.5869140625, "rewards/margins": 2.875, "rewards/rejected": -2.2890625, "step": 729 }, { "epoch": 0.8075221238938053, "grad_norm": 11.77560806274414, "learning_rate": 4.7640404934841284e-08, "logits/chosen": -1.25390625, "logits/rejected": -1.1171875, "logps/chosen": -239.5, "logps/rejected": -256.5, "loss": 0.2937, "rewards/accuracies": 0.828125, "rewards/chosen": 0.65234375, "rewards/margins": 2.7734375, "rewards/rejected": -2.12109375, "step": 730 }, { "epoch": 0.8086283185840708, "grad_norm": 14.025035858154297, "learning_rate": 4.7115235248159776e-08, "logits/chosen": -1.3359375, "logits/rejected": -1.14453125, "logps/chosen": -283.0, "logps/rejected": -303.0, "loss": 0.2726, "rewards/accuracies": 0.8359375, "rewards/chosen": 1.037109375, "rewards/margins": 3.1328125, "rewards/rejected": -2.10546875, "step": 731 }, { "epoch": 0.8097345132743363, "grad_norm": 13.514138221740723, "learning_rate": 4.659267496054847e-08, "logits/chosen": -1.2890625, "logits/rejected": -1.091796875, "logps/chosen": -248.5, "logps/rejected": -266.5, "loss": 0.2988, "rewards/accuracies": 0.84375, "rewards/chosen": 0.646484375, "rewards/margins": 2.84375, "rewards/rejected": -2.1953125, "step": 732 }, { "epoch": 0.8108407079646017, "grad_norm": 15.020828247070312, "learning_rate": 4.60727307929081e-08, "logits/chosen": -1.28125, "logits/rejected": -1.06640625, "logps/chosen": -258.5, "logps/rejected": -275.0, "loss": 0.3037, "rewards/accuracies": 0.8125, "rewards/chosen": 0.75390625, "rewards/margins": 3.09375, "rewards/rejected": -2.34375, "step": 733 }, { "epoch": 0.8119469026548672, "grad_norm": 14.957762718200684, "learning_rate": 4.555540943249187e-08, "logits/chosen": -1.3515625, "logits/rejected": -1.19921875, "logps/chosen": -248.5, "logps/rejected": -304.0, "loss": 0.3, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.705078125, "rewards/margins": 2.8515625, "rewards/rejected": -2.1484375, "step": 734 }, { "epoch": 0.8130530973451328, "grad_norm": 12.412934303283691, "learning_rate": 4.5040717532820046e-08, "logits/chosen": -1.28125, "logits/rejected": -1.15234375, "logps/chosen": -257.5, "logps/rejected": -296.0, "loss": 0.282, "rewards/accuracies": 0.8671875, "rewards/chosen": 0.7607421875, "rewards/margins": 2.953125, "rewards/rejected": -2.1953125, "step": 735 }, { "epoch": 0.8141592920353983, "grad_norm": 15.76734733581543, "learning_rate": 4.4528661713594125e-08, "logits/chosen": -1.3515625, "logits/rejected": -1.1796875, "logps/chosen": -238.5, "logps/rejected": -262.0, "loss": 0.3355, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8427734375, "rewards/margins": 2.96875, "rewards/rejected": -2.125, "step": 736 }, { "epoch": 0.8152654867256637, "grad_norm": 16.009498596191406, "learning_rate": 4.4019248560611454e-08, "logits/chosen": -1.1953125, "logits/rejected": -1.16796875, "logps/chosen": -271.0, "logps/rejected": -292.0, "loss": 0.34, "rewards/accuracies": 0.78125, "rewards/chosen": 0.8203125, "rewards/margins": 2.7578125, "rewards/rejected": -1.9375, "step": 737 }, { "epoch": 0.8163716814159292, "grad_norm": 12.171030044555664, "learning_rate": 4.3512484625681e-08, "logits/chosen": -1.29296875, "logits/rejected": -1.046875, "logps/chosen": -257.0, "logps/rejected": -285.0, "loss": 0.2528, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.6953125, "rewards/margins": 2.9921875, "rewards/rejected": -2.2890625, "step": 738 }, { "epoch": 0.8174778761061947, "grad_norm": 14.278532981872559, "learning_rate": 4.3008376426538903e-08, "logits/chosen": -1.31640625, "logits/rejected": -1.2265625, "logps/chosen": -250.5, "logps/rejected": -258.5, "loss": 0.3722, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.5029296875, "rewards/margins": 2.4921875, "rewards/rejected": -1.9921875, "step": 739 }, { "epoch": 0.8185840707964602, "grad_norm": 12.9563570022583, "learning_rate": 4.250693044676429e-08, "logits/chosen": -1.2734375, "logits/rejected": -1.17578125, "logps/chosen": -270.0, "logps/rejected": -287.0, "loss": 0.2685, "rewards/accuracies": 0.8671875, "rewards/chosen": 0.685546875, "rewards/margins": 3.1640625, "rewards/rejected": -2.4765625, "step": 740 }, { "epoch": 0.8196902654867256, "grad_norm": 13.755107879638672, "learning_rate": 4.2008153135696584e-08, "logits/chosen": -1.21484375, "logits/rejected": -1.09375, "logps/chosen": -246.5, "logps/rejected": -285.0, "loss": 0.3042, "rewards/accuracies": 0.8125, "rewards/chosen": 0.736328125, "rewards/margins": 2.9765625, "rewards/rejected": -2.2421875, "step": 741 }, { "epoch": 0.8207964601769911, "grad_norm": 12.855173110961914, "learning_rate": 4.151205090835183e-08, "logits/chosen": -1.2734375, "logits/rejected": -1.10546875, "logps/chosen": -243.5, "logps/rejected": -283.0, "loss": 0.2732, "rewards/accuracies": 0.875, "rewards/chosen": 0.916015625, "rewards/margins": 3.0859375, "rewards/rejected": -2.1640625, "step": 742 }, { "epoch": 0.8219026548672567, "grad_norm": 15.404345512390137, "learning_rate": 4.1018630145340735e-08, "logits/chosen": -1.29296875, "logits/rejected": -1.34375, "logps/chosen": -255.0, "logps/rejected": -262.5, "loss": 0.2993, "rewards/accuracies": 0.828125, "rewards/chosen": 0.810546875, "rewards/margins": 3.09375, "rewards/rejected": -2.28125, "step": 743 }, { "epoch": 0.8230088495575221, "grad_norm": 13.697175979614258, "learning_rate": 4.0527897192786433e-08, "logits/chosen": -1.2421875, "logits/rejected": -1.171875, "logps/chosen": -279.0, "logps/rejected": -289.0, "loss": 0.2732, "rewards/accuracies": 0.84375, "rewards/chosen": 0.83984375, "rewards/margins": 3.046875, "rewards/rejected": -2.19921875, "step": 744 }, { "epoch": 0.8241150442477876, "grad_norm": 14.881061553955078, "learning_rate": 4.003985836224255e-08, "logits/chosen": -1.296875, "logits/rejected": -1.30859375, "logps/chosen": -256.5, "logps/rejected": -284.0, "loss": 0.3474, "rewards/accuracies": 0.796875, "rewards/chosen": 0.5068359375, "rewards/margins": 2.375, "rewards/rejected": -1.87109375, "step": 745 }, { "epoch": 0.8252212389380531, "grad_norm": 13.085796356201172, "learning_rate": 3.955451993061268e-08, "logits/chosen": -1.33984375, "logits/rejected": -1.12890625, "logps/chosen": -258.0, "logps/rejected": -292.0, "loss": 0.2616, "rewards/accuracies": 0.875, "rewards/chosen": 0.91796875, "rewards/margins": 3.09375, "rewards/rejected": -2.1796875, "step": 746 }, { "epoch": 0.8263274336283186, "grad_norm": 13.392922401428223, "learning_rate": 3.9071888140068926e-08, "logits/chosen": -1.2109375, "logits/rejected": -1.16796875, "logps/chosen": -256.0, "logps/rejected": -316.0, "loss": 0.2815, "rewards/accuracies": 0.859375, "rewards/chosen": 0.986328125, "rewards/margins": 3.1171875, "rewards/rejected": -2.1328125, "step": 747 }, { "epoch": 0.827433628318584, "grad_norm": 12.065234184265137, "learning_rate": 3.859196919797228e-08, "logits/chosen": -1.3359375, "logits/rejected": -1.12109375, "logps/chosen": -247.5, "logps/rejected": -264.0, "loss": 0.3147, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.7265625, "rewards/margins": 2.9609375, "rewards/rejected": -2.2421875, "step": 748 }, { "epoch": 0.8285398230088495, "grad_norm": 14.233034133911133, "learning_rate": 3.811476927679227e-08, "logits/chosen": -1.16015625, "logits/rejected": -1.1640625, "logps/chosen": -265.0, "logps/rejected": -300.0, "loss": 0.3261, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.6884765625, "rewards/margins": 2.75, "rewards/rejected": -2.0703125, "step": 749 }, { "epoch": 0.8296460176991151, "grad_norm": 14.785301208496094, "learning_rate": 3.764029451402778e-08, "logits/chosen": -1.265625, "logits/rejected": -1.19140625, "logps/chosen": -236.0, "logps/rejected": -282.0, "loss": 0.297, "rewards/accuracies": 0.84375, "rewards/chosen": 0.791015625, "rewards/margins": 3.1640625, "rewards/rejected": -2.375, "step": 750 }, { "epoch": 0.8296460176991151, "eval_logits/chosen": -1.270017147064209, "eval_logits/rejected": -1.1617498397827148, "eval_logps/chosen": -253.53233337402344, "eval_logps/rejected": -280.19403076171875, "eval_loss": 0.31980380415916443, "eval_rewards/accuracies": 0.8147646188735962, "eval_rewards/chosen": 0.72982257604599, "eval_rewards/margins": 2.827347755432129, "eval_rewards/rejected": -2.09759783744812, "eval_runtime": 193.0983, "eval_samples_per_second": 66.562, "eval_steps_per_second": 1.041, "step": 750 }, { "epoch": 0.8307522123893806, "grad_norm": 13.682051658630371, "learning_rate": 3.716855101212826e-08, "logits/chosen": -1.26953125, "logits/rejected": -1.1953125, "logps/chosen": -270.0, "logps/rejected": -284.5, "loss": 0.3091, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.740234375, "rewards/margins": 2.8125, "rewards/rejected": -2.0703125, "step": 751 }, { "epoch": 0.831858407079646, "grad_norm": 14.422385215759277, "learning_rate": 3.6699544838415034e-08, "logits/chosen": -1.328125, "logits/rejected": -1.1015625, "logps/chosen": -268.0, "logps/rejected": -266.0, "loss": 0.3043, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.724609375, "rewards/margins": 2.875, "rewards/rejected": -2.15234375, "step": 752 }, { "epoch": 0.8329646017699115, "grad_norm": 14.678279876708984, "learning_rate": 3.623328202500322e-08, "logits/chosen": -1.21484375, "logits/rejected": -1.13671875, "logps/chosen": -280.0, "logps/rejected": -305.0, "loss": 0.3304, "rewards/accuracies": 0.828125, "rewards/chosen": 0.6796875, "rewards/margins": 2.4765625, "rewards/rejected": -1.796875, "step": 753 }, { "epoch": 0.834070796460177, "grad_norm": 12.621984481811523, "learning_rate": 3.576976856872438e-08, "logits/chosen": -1.421875, "logits/rejected": -1.0859375, "logps/chosen": -252.0, "logps/rejected": -276.0, "loss": 0.294, "rewards/accuracies": 0.84375, "rewards/chosen": 0.73046875, "rewards/margins": 2.8515625, "rewards/rejected": -2.1171875, "step": 754 }, { "epoch": 0.8351769911504425, "grad_norm": 11.676498413085938, "learning_rate": 3.530901043104928e-08, "logits/chosen": -1.26171875, "logits/rejected": -1.1953125, "logps/chosen": -227.0, "logps/rejected": -266.0, "loss": 0.2778, "rewards/accuracies": 0.8828125, "rewards/chosen": 1.048828125, "rewards/margins": 3.109375, "rewards/rejected": -2.0625, "step": 755 }, { "epoch": 0.8362831858407079, "grad_norm": 12.77115249633789, "learning_rate": 3.4851013538011035e-08, "logits/chosen": -1.3125, "logits/rejected": -1.17578125, "logps/chosen": -252.5, "logps/rejected": -290.0, "loss": 0.2771, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.7578125, "rewards/margins": 3.0078125, "rewards/rejected": -2.2578125, "step": 756 }, { "epoch": 0.8373893805309734, "grad_norm": 13.537567138671875, "learning_rate": 3.439578378012925e-08, "logits/chosen": -1.34765625, "logits/rejected": -1.17578125, "logps/chosen": -251.0, "logps/rejected": -285.5, "loss": 0.2978, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9921875, "rewards/margins": 3.140625, "rewards/rejected": -2.1484375, "step": 757 }, { "epoch": 0.838495575221239, "grad_norm": 12.62022590637207, "learning_rate": 3.394332701233391e-08, "logits/chosen": -1.265625, "logits/rejected": -1.1796875, "logps/chosen": -242.5, "logps/rejected": -261.0, "loss": 0.2755, "rewards/accuracies": 0.8671875, "rewards/chosen": 0.8359375, "rewards/margins": 3.0078125, "rewards/rejected": -2.1640625, "step": 758 }, { "epoch": 0.8396017699115044, "grad_norm": 14.283227920532227, "learning_rate": 3.349364905389032e-08, "logits/chosen": -1.15234375, "logits/rejected": -1.158203125, "logps/chosen": -269.0, "logps/rejected": -290.0, "loss": 0.3305, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.658203125, "rewards/margins": 2.8828125, "rewards/rejected": -2.2265625, "step": 759 }, { "epoch": 0.8407079646017699, "grad_norm": 12.961087226867676, "learning_rate": 3.304675568832427e-08, "logits/chosen": -1.265625, "logits/rejected": -1.109375, "logps/chosen": -263.0, "logps/rejected": -279.5, "loss": 0.3033, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.755859375, "rewards/margins": 2.5078125, "rewards/rejected": -1.75, "step": 760 }, { "epoch": 0.8418141592920354, "grad_norm": 14.768875122070312, "learning_rate": 3.260265266334725e-08, "logits/chosen": -1.21484375, "logits/rejected": -1.125, "logps/chosen": -256.0, "logps/rejected": -282.0, "loss": 0.382, "rewards/accuracies": 0.796875, "rewards/chosen": 0.765625, "rewards/margins": 2.40234375, "rewards/rejected": -1.63671875, "step": 761 }, { "epoch": 0.8429203539823009, "grad_norm": 16.72699546813965, "learning_rate": 3.216134569078316e-08, "logits/chosen": -1.25, "logits/rejected": -1.22265625, "logps/chosen": -266.0, "logps/rejected": -300.0, "loss": 0.3642, "rewards/accuracies": 0.765625, "rewards/chosen": 0.90625, "rewards/margins": 2.7421875, "rewards/rejected": -1.83203125, "step": 762 }, { "epoch": 0.8440265486725663, "grad_norm": 12.911907196044922, "learning_rate": 3.172284044649437e-08, "logits/chosen": -1.265625, "logits/rejected": -1.1171875, "logps/chosen": -260.5, "logps/rejected": -308.0, "loss": 0.3017, "rewards/accuracies": 0.828125, "rewards/chosen": 0.6640625, "rewards/margins": 2.6640625, "rewards/rejected": -2.0, "step": 763 }, { "epoch": 0.8451327433628318, "grad_norm": 15.997196197509766, "learning_rate": 3.128714257030882e-08, "logits/chosen": -1.27734375, "logits/rejected": -1.15625, "logps/chosen": -284.0, "logps/rejected": -301.0, "loss": 0.3964, "rewards/accuracies": 0.7421875, "rewards/chosen": 0.607421875, "rewards/margins": 2.3203125, "rewards/rejected": -1.71875, "step": 764 }, { "epoch": 0.8462389380530974, "grad_norm": 14.732622146606445, "learning_rate": 3.085425766594768e-08, "logits/chosen": -1.2265625, "logits/rejected": -1.23828125, "logps/chosen": -262.0, "logps/rejected": -245.0, "loss": 0.3107, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.724609375, "rewards/margins": 2.9375, "rewards/rejected": -2.21875, "step": 765 }, { "epoch": 0.8473451327433629, "grad_norm": 14.123418807983398, "learning_rate": 3.042419130095292e-08, "logits/chosen": -1.26953125, "logits/rejected": -1.19140625, "logps/chosen": -249.0, "logps/rejected": -303.0, "loss": 0.2951, "rewards/accuracies": 0.796875, "rewards/chosen": 0.73828125, "rewards/margins": 3.2109375, "rewards/rejected": -2.4765625, "step": 766 }, { "epoch": 0.8484513274336283, "grad_norm": 15.25007438659668, "learning_rate": 2.999694900661609e-08, "logits/chosen": -1.2578125, "logits/rejected": -1.12109375, "logps/chosen": -280.0, "logps/rejected": -286.0, "loss": 0.3976, "rewards/accuracies": 0.75, "rewards/chosen": 0.296875, "rewards/margins": 1.96484375, "rewards/rejected": -1.66796875, "step": 767 }, { "epoch": 0.8495575221238938, "grad_norm": 14.552936553955078, "learning_rate": 2.9572536277906984e-08, "logits/chosen": -1.2421875, "logits/rejected": -1.15234375, "logps/chosen": -251.5, "logps/rejected": -294.0, "loss": 0.3292, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.5849609375, "rewards/margins": 2.734375, "rewards/rejected": -2.1484375, "step": 768 }, { "epoch": 0.8506637168141593, "grad_norm": 12.925614356994629, "learning_rate": 2.9150958573402885e-08, "logits/chosen": -1.3828125, "logits/rejected": -1.171875, "logps/chosen": -267.0, "logps/rejected": -305.0, "loss": 0.2762, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.775390625, "rewards/margins": 2.8984375, "rewards/rejected": -2.125, "step": 769 }, { "epoch": 0.8517699115044248, "grad_norm": 14.300766944885254, "learning_rate": 2.8732221315218573e-08, "logits/chosen": -1.18359375, "logits/rejected": -1.12109375, "logps/chosen": -257.0, "logps/rejected": -279.0, "loss": 0.344, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.52587890625, "rewards/margins": 2.484375, "rewards/rejected": -1.953125, "step": 770 }, { "epoch": 0.8528761061946902, "grad_norm": 13.133272171020508, "learning_rate": 2.8316329888936315e-08, "logits/chosen": -1.2578125, "logits/rejected": -1.1015625, "logps/chosen": -250.5, "logps/rejected": -269.0, "loss": 0.2487, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.927734375, "rewards/margins": 3.046875, "rewards/rejected": -2.12109375, "step": 771 }, { "epoch": 0.8539823008849557, "grad_norm": 12.045042991638184, "learning_rate": 2.7903289643537e-08, "logits/chosen": -1.34375, "logits/rejected": -1.1171875, "logps/chosen": -256.5, "logps/rejected": -267.5, "loss": 0.2765, "rewards/accuracies": 0.859375, "rewards/chosen": 0.76171875, "rewards/margins": 2.96875, "rewards/rejected": -2.2109375, "step": 772 }, { "epoch": 0.8550884955752213, "grad_norm": 12.052350044250488, "learning_rate": 2.7493105891330832e-08, "logits/chosen": -1.28125, "logits/rejected": -1.16015625, "logps/chosen": -240.0, "logps/rejected": -274.0, "loss": 0.2838, "rewards/accuracies": 0.84375, "rewards/chosen": 0.81640625, "rewards/margins": 3.1171875, "rewards/rejected": -2.296875, "step": 773 }, { "epoch": 0.8561946902654868, "grad_norm": 12.869089126586914, "learning_rate": 2.7085783907889514e-08, "logits/chosen": -1.26953125, "logits/rejected": -1.1796875, "logps/chosen": -260.0, "logps/rejected": -274.0, "loss": 0.3115, "rewards/accuracies": 0.828125, "rewards/chosen": 0.6953125, "rewards/margins": 2.546875, "rewards/rejected": -1.8515625, "step": 774 }, { "epoch": 0.8573008849557522, "grad_norm": 13.210247993469238, "learning_rate": 2.6681328931977942e-08, "logits/chosen": -1.2578125, "logits/rejected": -1.03125, "logps/chosen": -247.5, "logps/rejected": -286.0, "loss": 0.2939, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.896484375, "rewards/margins": 2.9765625, "rewards/rejected": -2.078125, "step": 775 }, { "epoch": 0.8584070796460177, "grad_norm": 13.413789749145508, "learning_rate": 2.6279746165487255e-08, "logits/chosen": -1.26171875, "logits/rejected": -1.171875, "logps/chosen": -267.0, "logps/rejected": -282.0, "loss": 0.3004, "rewards/accuracies": 0.828125, "rewards/chosen": 0.658203125, "rewards/margins": 2.6015625, "rewards/rejected": -1.94921875, "step": 776 }, { "epoch": 0.8595132743362832, "grad_norm": 13.01457691192627, "learning_rate": 2.5881040773367502e-08, "logits/chosen": -1.1875, "logits/rejected": -1.05859375, "logps/chosen": -240.0, "logps/rejected": -257.0, "loss": 0.3088, "rewards/accuracies": 0.8125, "rewards/chosen": 0.716796875, "rewards/margins": 3.03125, "rewards/rejected": -2.31640625, "step": 777 }, { "epoch": 0.8606194690265486, "grad_norm": 12.700637817382812, "learning_rate": 2.5485217883561616e-08, "logits/chosen": -1.30859375, "logits/rejected": -1.1484375, "logps/chosen": -248.5, "logps/rejected": -279.0, "loss": 0.2977, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.826171875, "rewards/margins": 2.9375, "rewards/rejected": -2.11328125, "step": 778 }, { "epoch": 0.8617256637168141, "grad_norm": 13.09081745147705, "learning_rate": 2.5092282586939183e-08, "logits/chosen": -1.3828125, "logits/rejected": -1.1328125, "logps/chosen": -272.0, "logps/rejected": -284.5, "loss": 0.2959, "rewards/accuracies": 0.828125, "rewards/chosen": 0.755859375, "rewards/margins": 2.7265625, "rewards/rejected": -1.96875, "step": 779 }, { "epoch": 0.8628318584070797, "grad_norm": 12.912965774536133, "learning_rate": 2.470223993723103e-08, "logits/chosen": -1.171875, "logits/rejected": -1.10546875, "logps/chosen": -259.0, "logps/rejected": -283.5, "loss": 0.304, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.73828125, "rewards/margins": 2.9609375, "rewards/rejected": -2.21875, "step": 780 }, { "epoch": 0.8639380530973452, "grad_norm": 13.398490905761719, "learning_rate": 2.4315094950964343e-08, "logits/chosen": -1.375, "logits/rejected": -1.1953125, "logps/chosen": -272.5, "logps/rejected": -278.5, "loss": 0.3286, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.61328125, "rewards/margins": 2.4140625, "rewards/rejected": -1.796875, "step": 781 }, { "epoch": 0.8650442477876106, "grad_norm": 13.045671463012695, "learning_rate": 2.393085260739794e-08, "logits/chosen": -1.36328125, "logits/rejected": -1.15625, "logps/chosen": -242.0, "logps/rejected": -263.5, "loss": 0.3228, "rewards/accuracies": 0.78125, "rewards/chosen": 0.80859375, "rewards/margins": 2.7421875, "rewards/rejected": -1.94140625, "step": 782 }, { "epoch": 0.8661504424778761, "grad_norm": 15.309684753417969, "learning_rate": 2.3549517848458435e-08, "logits/chosen": -1.26171875, "logits/rejected": -1.1328125, "logps/chosen": -279.0, "logps/rejected": -301.0, "loss": 0.3618, "rewards/accuracies": 0.75, "rewards/chosen": 0.724609375, "rewards/margins": 2.4765625, "rewards/rejected": -1.75, "step": 783 }, { "epoch": 0.8672566371681416, "grad_norm": 12.972829818725586, "learning_rate": 2.3171095578676637e-08, "logits/chosen": -1.2578125, "logits/rejected": -1.1171875, "logps/chosen": -255.5, "logps/rejected": -305.0, "loss": 0.2948, "rewards/accuracies": 0.796875, "rewards/chosen": 0.84375, "rewards/margins": 2.9140625, "rewards/rejected": -2.078125, "step": 784 }, { "epoch": 0.8683628318584071, "grad_norm": 12.639619827270508, "learning_rate": 2.2795590665124263e-08, "logits/chosen": -1.2421875, "logits/rejected": -1.09375, "logps/chosen": -235.0, "logps/rejected": -269.5, "loss": 0.2619, "rewards/accuracies": 0.8828125, "rewards/chosen": 0.7578125, "rewards/margins": 3.171875, "rewards/rejected": -2.421875, "step": 785 }, { "epoch": 0.8694690265486725, "grad_norm": 12.34381103515625, "learning_rate": 2.2423007937351634e-08, "logits/chosen": -1.1953125, "logits/rejected": -1.13671875, "logps/chosen": -254.0, "logps/rejected": -264.0, "loss": 0.2839, "rewards/accuracies": 0.84375, "rewards/chosen": 0.521484375, "rewards/margins": 2.8828125, "rewards/rejected": -2.359375, "step": 786 }, { "epoch": 0.870575221238938, "grad_norm": 13.31490707397461, "learning_rate": 2.205335218732543e-08, "logits/chosen": -1.2734375, "logits/rejected": -1.21875, "logps/chosen": -259.0, "logps/rejected": -280.0, "loss": 0.3176, "rewards/accuracies": 0.796875, "rewards/chosen": 0.666015625, "rewards/margins": 2.65625, "rewards/rejected": -1.9921875, "step": 787 }, { "epoch": 0.8716814159292036, "grad_norm": 14.77593994140625, "learning_rate": 2.1686628169366923e-08, "logits/chosen": -1.109375, "logits/rejected": -1.1171875, "logps/chosen": -266.0, "logps/rejected": -297.0, "loss": 0.3291, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.771484375, "rewards/margins": 2.8671875, "rewards/rejected": -2.1015625, "step": 788 }, { "epoch": 0.8727876106194691, "grad_norm": 12.58286190032959, "learning_rate": 2.1322840600091096e-08, "logits/chosen": -1.265625, "logits/rejected": -1.15625, "logps/chosen": -249.5, "logps/rejected": -260.0, "loss": 0.2995, "rewards/accuracies": 0.828125, "rewards/chosen": 0.3896484375, "rewards/margins": 2.6328125, "rewards/rejected": -2.2421875, "step": 789 }, { "epoch": 0.8738938053097345, "grad_norm": 13.99928092956543, "learning_rate": 2.0961994158345763e-08, "logits/chosen": -1.34765625, "logits/rejected": -1.0859375, "logps/chosen": -254.5, "logps/rejected": -263.5, "loss": 0.2972, "rewards/accuracies": 0.828125, "rewards/chosen": 0.662109375, "rewards/margins": 2.8359375, "rewards/rejected": -2.16796875, "step": 790 }, { "epoch": 0.875, "grad_norm": 11.941873550415039, "learning_rate": 2.0604093485151548e-08, "logits/chosen": -1.31640625, "logits/rejected": -1.1171875, "logps/chosen": -261.0, "logps/rejected": -270.0, "loss": 0.2886, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.818359375, "rewards/margins": 3.15625, "rewards/rejected": -2.3359375, "step": 791 }, { "epoch": 0.8761061946902655, "grad_norm": 17.870344161987305, "learning_rate": 2.0249143183642097e-08, "logits/chosen": -1.3984375, "logits/rejected": -1.1953125, "logps/chosen": -244.0, "logps/rejected": -268.0, "loss": 0.4293, "rewards/accuracies": 0.765625, "rewards/chosen": 0.4189453125, "rewards/margins": 2.26171875, "rewards/rejected": -1.83984375, "step": 792 }, { "epoch": 0.8772123893805309, "grad_norm": 12.3770112991333, "learning_rate": 1.989714781900484e-08, "logits/chosen": -1.3671875, "logits/rejected": -1.14453125, "logps/chosen": -264.0, "logps/rejected": -284.0, "loss": 0.2621, "rewards/accuracies": 0.8671875, "rewards/chosen": 0.869140625, "rewards/margins": 3.03125, "rewards/rejected": -2.1640625, "step": 793 }, { "epoch": 0.8783185840707964, "grad_norm": 14.804219245910645, "learning_rate": 1.95481119184224e-08, "logits/chosen": -1.1953125, "logits/rejected": -1.140625, "logps/chosen": -245.0, "logps/rejected": -302.0, "loss": 0.3552, "rewards/accuracies": 0.78125, "rewards/chosen": 0.638671875, "rewards/margins": 2.6875, "rewards/rejected": -2.04296875, "step": 794 }, { "epoch": 0.879424778761062, "grad_norm": 13.518996238708496, "learning_rate": 1.9202039971014243e-08, "logits/chosen": -1.375, "logits/rejected": -1.20703125, "logps/chosen": -241.5, "logps/rejected": -263.0, "loss": 0.3375, "rewards/accuracies": 0.78125, "rewards/chosen": 0.69921875, "rewards/margins": 2.734375, "rewards/rejected": -2.03125, "step": 795 }, { "epoch": 0.8805309734513275, "grad_norm": 13.753449440002441, "learning_rate": 1.8858936427779137e-08, "logits/chosen": -1.21875, "logits/rejected": -1.12109375, "logps/chosen": -262.5, "logps/rejected": -289.0, "loss": 0.2857, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.75, "rewards/margins": 2.8203125, "rewards/rejected": -2.0703125, "step": 796 }, { "epoch": 0.8816371681415929, "grad_norm": 15.924559593200684, "learning_rate": 1.8518805701537548e-08, "logits/chosen": -1.25, "logits/rejected": -1.09765625, "logps/chosen": -253.5, "logps/rejected": -264.5, "loss": 0.3678, "rewards/accuracies": 0.75, "rewards/chosen": 0.671875, "rewards/margins": 2.640625, "rewards/rejected": -1.96484375, "step": 797 }, { "epoch": 0.8827433628318584, "grad_norm": 12.930898666381836, "learning_rate": 1.818165216687531e-08, "logits/chosen": -1.26171875, "logits/rejected": -1.1640625, "logps/chosen": -248.5, "logps/rejected": -258.0, "loss": 0.2994, "rewards/accuracies": 0.859375, "rewards/chosen": 0.732421875, "rewards/margins": 2.9296875, "rewards/rejected": -2.1953125, "step": 798 }, { "epoch": 0.8838495575221239, "grad_norm": 14.774980545043945, "learning_rate": 1.7847480160087025e-08, "logits/chosen": -1.26953125, "logits/rejected": -1.125, "logps/chosen": -250.5, "logps/rejected": -290.0, "loss": 0.3111, "rewards/accuracies": 0.828125, "rewards/chosen": 0.724609375, "rewards/margins": 2.953125, "rewards/rejected": -2.234375, "step": 799 }, { "epoch": 0.8849557522123894, "grad_norm": 13.716545104980469, "learning_rate": 1.7516293979120523e-08, "logits/chosen": -1.2578125, "logits/rejected": -1.1328125, "logps/chosen": -262.5, "logps/rejected": -267.0, "loss": 0.3317, "rewards/accuracies": 0.796875, "rewards/chosen": 0.5703125, "rewards/margins": 2.640625, "rewards/rejected": -2.0625, "step": 800 }, { "epoch": 0.8849557522123894, "eval_logits/chosen": -1.2699394226074219, "eval_logits/rejected": -1.161244511604309, "eval_logps/chosen": -253.7014923095703, "eval_logps/rejected": -280.33831787109375, "eval_loss": 0.31928393244743347, "eval_rewards/accuracies": 0.8145930171012878, "eval_rewards/chosen": 0.7206642627716064, "eval_rewards/margins": 2.836987018585205, "eval_rewards/rejected": -2.1172263622283936, "eval_runtime": 193.0847, "eval_samples_per_second": 66.567, "eval_steps_per_second": 1.041, "step": 800 }, { "epoch": 0.8860619469026548, "grad_norm": 14.249678611755371, "learning_rate": 1.7188097883521352e-08, "logits/chosen": -1.28515625, "logits/rejected": -1.171875, "logps/chosen": -248.5, "logps/rejected": -251.0, "loss": 0.2843, "rewards/accuracies": 0.84375, "rewards/chosen": 0.751953125, "rewards/margins": 2.984375, "rewards/rejected": -2.234375, "step": 801 }, { "epoch": 0.8871681415929203, "grad_norm": 12.044215202331543, "learning_rate": 1.6862896094378244e-08, "logits/chosen": -1.296875, "logits/rejected": -1.2578125, "logps/chosen": -236.0, "logps/rejected": -261.5, "loss": 0.2971, "rewards/accuracies": 0.828125, "rewards/chosen": 0.8515625, "rewards/margins": 3.1171875, "rewards/rejected": -2.265625, "step": 802 }, { "epoch": 0.8882743362831859, "grad_norm": 13.170328140258789, "learning_rate": 1.654069279426873e-08, "logits/chosen": -1.21484375, "logits/rejected": -1.125, "logps/chosen": -255.5, "logps/rejected": -300.0, "loss": 0.2789, "rewards/accuracies": 0.8359375, "rewards/chosen": 1.017578125, "rewards/margins": 3.1328125, "rewards/rejected": -2.1171875, "step": 803 }, { "epoch": 0.8893805309734514, "grad_norm": 13.188612937927246, "learning_rate": 1.6221492127205166e-08, "logits/chosen": -1.26953125, "logits/rejected": -1.24609375, "logps/chosen": -269.0, "logps/rejected": -293.0, "loss": 0.2959, "rewards/accuracies": 0.828125, "rewards/chosen": 0.900390625, "rewards/margins": 2.8125, "rewards/rejected": -1.921875, "step": 804 }, { "epoch": 0.8904867256637168, "grad_norm": 17.259361267089844, "learning_rate": 1.5905298198581774e-08, "logits/chosen": -1.25, "logits/rejected": -1.171875, "logps/chosen": -271.0, "logps/rejected": -301.0, "loss": 0.3979, "rewards/accuracies": 0.7421875, "rewards/chosen": 0.814453125, "rewards/margins": 2.4921875, "rewards/rejected": -1.6796875, "step": 805 }, { "epoch": 0.8915929203539823, "grad_norm": 13.314188957214355, "learning_rate": 1.5592115075121508e-08, "logits/chosen": -1.3203125, "logits/rejected": -1.15234375, "logps/chosen": -250.5, "logps/rejected": -294.0, "loss": 0.3297, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.822265625, "rewards/margins": 2.546875, "rewards/rejected": -1.7265625, "step": 806 }, { "epoch": 0.8926991150442478, "grad_norm": 12.087140083312988, "learning_rate": 1.5281946784824002e-08, "logits/chosen": -1.33203125, "logits/rejected": -1.15625, "logps/chosen": -250.0, "logps/rejected": -297.0, "loss": 0.2368, "rewards/accuracies": 0.8828125, "rewards/chosen": 0.66015625, "rewards/margins": 3.1953125, "rewards/rejected": -2.5234375, "step": 807 }, { "epoch": 0.8938053097345132, "grad_norm": 15.906932830810547, "learning_rate": 1.4974797316913673e-08, "logits/chosen": -1.28125, "logits/rejected": -1.15625, "logps/chosen": -287.0, "logps/rejected": -302.0, "loss": 0.3282, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.671875, "rewards/margins": 2.625, "rewards/rejected": -1.9609375, "step": 808 }, { "epoch": 0.8949115044247787, "grad_norm": 13.53934383392334, "learning_rate": 1.4670670621788229e-08, "logits/chosen": -1.1328125, "logits/rejected": -1.1015625, "logps/chosen": -266.0, "logps/rejected": -293.0, "loss": 0.2885, "rewards/accuracies": 0.8515625, "rewards/chosen": 1.05859375, "rewards/margins": 3.359375, "rewards/rejected": -2.3046875, "step": 809 }, { "epoch": 0.8960176991150443, "grad_norm": 13.705190658569336, "learning_rate": 1.4369570610968274e-08, "logits/chosen": -1.3515625, "logits/rejected": -1.17578125, "logps/chosen": -250.0, "logps/rejected": -271.0, "loss": 0.346, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.693359375, "rewards/margins": 2.6171875, "rewards/rejected": -1.92578125, "step": 810 }, { "epoch": 0.8971238938053098, "grad_norm": 12.113191604614258, "learning_rate": 1.4071501157046666e-08, "logits/chosen": -1.19921875, "logits/rejected": -1.1328125, "logps/chosen": -256.0, "logps/rejected": -274.5, "loss": 0.2523, "rewards/accuracies": 0.8671875, "rewards/chosen": 0.84375, "rewards/margins": 3.1171875, "rewards/rejected": -2.27734375, "step": 811 }, { "epoch": 0.8982300884955752, "grad_norm": 13.494206428527832, "learning_rate": 1.3776466093638695e-08, "logits/chosen": -1.234375, "logits/rejected": -1.0546875, "logps/chosen": -241.0, "logps/rejected": -272.0, "loss": 0.2704, "rewards/accuracies": 0.875, "rewards/chosen": 0.990234375, "rewards/margins": 3.4140625, "rewards/rejected": -2.4296875, "step": 812 }, { "epoch": 0.8993362831858407, "grad_norm": 11.447568893432617, "learning_rate": 1.3484469215333082e-08, "logits/chosen": -1.34765625, "logits/rejected": -1.19921875, "logps/chosen": -252.5, "logps/rejected": -244.0, "loss": 0.257, "rewards/accuracies": 0.875, "rewards/chosen": 0.6953125, "rewards/margins": 3.109375, "rewards/rejected": -2.4140625, "step": 813 }, { "epoch": 0.9004424778761062, "grad_norm": 17.08716583251953, "learning_rate": 1.3195514277642817e-08, "logits/chosen": -1.41015625, "logits/rejected": -1.3125, "logps/chosen": -264.0, "logps/rejected": -257.5, "loss": 0.4052, "rewards/accuracies": 0.7265625, "rewards/chosen": 0.4658203125, "rewards/margins": 2.4140625, "rewards/rejected": -1.94921875, "step": 814 }, { "epoch": 0.9015486725663717, "grad_norm": 12.849235534667969, "learning_rate": 1.2909604996957091e-08, "logits/chosen": -1.2890625, "logits/rejected": -1.14453125, "logps/chosen": -252.5, "logps/rejected": -279.0, "loss": 0.2986, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.75, "rewards/margins": 2.734375, "rewards/rejected": -1.98046875, "step": 815 }, { "epoch": 0.9026548672566371, "grad_norm": 16.0147647857666, "learning_rate": 1.2626745050493493e-08, "logits/chosen": -1.36328125, "logits/rejected": -1.23828125, "logps/chosen": -245.5, "logps/rejected": -296.0, "loss": 0.3544, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.650390625, "rewards/margins": 2.578125, "rewards/rejected": -1.93359375, "step": 816 }, { "epoch": 0.9037610619469026, "grad_norm": 12.192747116088867, "learning_rate": 1.234693807625048e-08, "logits/chosen": -1.234375, "logits/rejected": -1.21875, "logps/chosen": -256.0, "logps/rejected": -277.0, "loss": 0.2743, "rewards/accuracies": 0.859375, "rewards/chosen": 0.802734375, "rewards/margins": 3.2421875, "rewards/rejected": -2.4296875, "step": 817 }, { "epoch": 0.9048672566371682, "grad_norm": 19.156158447265625, "learning_rate": 1.2070187672960947e-08, "logits/chosen": -1.4296875, "logits/rejected": -1.125, "logps/chosen": -261.0, "logps/rejected": -283.0, "loss": 0.4435, "rewards/accuracies": 0.765625, "rewards/chosen": 0.70703125, "rewards/margins": 2.515625, "rewards/rejected": -1.8125, "step": 818 }, { "epoch": 0.9059734513274337, "grad_norm": 14.084782600402832, "learning_rate": 1.179649740004557e-08, "logits/chosen": -1.24609375, "logits/rejected": -1.0703125, "logps/chosen": -273.0, "logps/rejected": -272.5, "loss": 0.2877, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4453125, "rewards/margins": 2.546875, "rewards/rejected": -2.1015625, "step": 819 }, { "epoch": 0.9070796460176991, "grad_norm": 14.487624168395996, "learning_rate": 1.1525870777567393e-08, "logits/chosen": -1.234375, "logits/rejected": -1.16015625, "logps/chosen": -273.0, "logps/rejected": -278.0, "loss": 0.3505, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.5, "rewards/margins": 2.34375, "rewards/rejected": -1.83984375, "step": 820 }, { "epoch": 0.9081858407079646, "grad_norm": 13.851645469665527, "learning_rate": 1.1258311286186207e-08, "logits/chosen": -1.28125, "logits/rejected": -1.1015625, "logps/chosen": -244.0, "logps/rejected": -292.0, "loss": 0.2884, "rewards/accuracies": 0.828125, "rewards/chosen": 0.9921875, "rewards/margins": 3.2578125, "rewards/rejected": -2.265625, "step": 821 }, { "epoch": 0.9092920353982301, "grad_norm": 13.431646347045898, "learning_rate": 1.0993822367114047e-08, "logits/chosen": -1.359375, "logits/rejected": -1.1640625, "logps/chosen": -285.0, "logps/rejected": -291.0, "loss": 0.2858, "rewards/accuracies": 0.84375, "rewards/chosen": 0.703125, "rewards/margins": 2.6796875, "rewards/rejected": -1.96875, "step": 822 }, { "epoch": 0.9103982300884956, "grad_norm": 15.518174171447754, "learning_rate": 1.0732407422070794e-08, "logits/chosen": -1.3359375, "logits/rejected": -1.19140625, "logps/chosen": -230.5, "logps/rejected": -270.0, "loss": 0.3882, "rewards/accuracies": 0.765625, "rewards/chosen": 0.5517578125, "rewards/margins": 2.5546875, "rewards/rejected": -1.99609375, "step": 823 }, { "epoch": 0.911504424778761, "grad_norm": 12.571428298950195, "learning_rate": 1.0474069813240505e-08, "logits/chosen": -1.23046875, "logits/rejected": -1.1328125, "logps/chosen": -241.0, "logps/rejected": -290.0, "loss": 0.3193, "rewards/accuracies": 0.84375, "rewards/chosen": 0.53515625, "rewards/margins": 2.59375, "rewards/rejected": -2.0546875, "step": 824 }, { "epoch": 0.9126106194690266, "grad_norm": 14.974266052246094, "learning_rate": 1.021881286322801e-08, "logits/chosen": -1.2421875, "logits/rejected": -1.109375, "logps/chosen": -264.5, "logps/rejected": -283.0, "loss": 0.3549, "rewards/accuracies": 0.828125, "rewards/chosen": 0.5546875, "rewards/margins": 2.23828125, "rewards/rejected": -1.6796875, "step": 825 }, { "epoch": 0.9137168141592921, "grad_norm": 12.049909591674805, "learning_rate": 9.966639855016446e-09, "logits/chosen": -1.3984375, "logits/rejected": -1.203125, "logps/chosen": -238.0, "logps/rejected": -257.0, "loss": 0.2548, "rewards/accuracies": 0.890625, "rewards/chosen": 0.775390625, "rewards/margins": 3.203125, "rewards/rejected": -2.4296875, "step": 826 }, { "epoch": 0.9148230088495575, "grad_norm": 16.12934112548828, "learning_rate": 9.71755403192484e-09, "logits/chosen": -1.27734375, "logits/rejected": -1.12890625, "logps/chosen": -274.0, "logps/rejected": -281.0, "loss": 0.3717, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.437744140625, "rewards/margins": 2.4609375, "rewards/rejected": -2.0234375, "step": 827 }, { "epoch": 0.915929203539823, "grad_norm": 15.575227737426758, "learning_rate": 9.47155859756632e-09, "logits/chosen": -1.34765625, "logits/rejected": -1.2265625, "logps/chosen": -244.0, "logps/rejected": -277.0, "loss": 0.3755, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.701171875, "rewards/margins": 2.8046875, "rewards/rejected": -2.109375, "step": 828 }, { "epoch": 0.9170353982300885, "grad_norm": 13.580742835998535, "learning_rate": 9.228656715807249e-09, "logits/chosen": -1.2265625, "logits/rejected": -1.125, "logps/chosen": -264.0, "logps/rejected": -301.0, "loss": 0.2762, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.828125, "rewards/margins": 3.140625, "rewards/rejected": -2.3125, "step": 829 }, { "epoch": 0.918141592920354, "grad_norm": 14.229433059692383, "learning_rate": 8.988851510726092e-09, "logits/chosen": -1.375, "logits/rejected": -1.109375, "logps/chosen": -269.0, "logps/rejected": -276.0, "loss": 0.2769, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.5478515625, "rewards/margins": 2.8828125, "rewards/rejected": -2.3359375, "step": 830 }, { "epoch": 0.9192477876106194, "grad_norm": 12.20298957824707, "learning_rate": 8.752146066573597e-09, "logits/chosen": -1.171875, "logits/rejected": -1.1484375, "logps/chosen": -254.0, "logps/rejected": -291.0, "loss": 0.2699, "rewards/accuracies": 0.828125, "rewards/chosen": 0.734375, "rewards/margins": 2.8125, "rewards/rejected": -2.08203125, "step": 831 }, { "epoch": 0.9203539823008849, "grad_norm": 14.036704063415527, "learning_rate": 8.518543427732949e-09, "logits/chosen": -1.296875, "logits/rejected": -1.1171875, "logps/chosen": -265.0, "logps/rejected": -267.0, "loss": 0.3249, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.578125, "rewards/margins": 2.5625, "rewards/rejected": -1.98046875, "step": 832 }, { "epoch": 0.9214601769911505, "grad_norm": 12.48025131225586, "learning_rate": 8.288046598680627e-09, "logits/chosen": -1.234375, "logits/rejected": -1.12109375, "logps/chosen": -260.0, "logps/rejected": -268.0, "loss": 0.2814, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.66796875, "rewards/margins": 3.0859375, "rewards/rejected": -2.421875, "step": 833 }, { "epoch": 0.922566371681416, "grad_norm": 12.8703031539917, "learning_rate": 8.060658543947829e-09, "logits/chosen": -1.2890625, "logits/rejected": -1.30078125, "logps/chosen": -223.0, "logps/rejected": -258.5, "loss": 0.2808, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8125, "rewards/margins": 2.953125, "rewards/rejected": -2.1484375, "step": 834 }, { "epoch": 0.9236725663716814, "grad_norm": 13.693394660949707, "learning_rate": 7.836382188082302e-09, "logits/chosen": -1.234375, "logits/rejected": -1.203125, "logps/chosen": -264.0, "logps/rejected": -289.0, "loss": 0.2979, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.48828125, "rewards/margins": 2.7421875, "rewards/rejected": -2.2421875, "step": 835 }, { "epoch": 0.9247787610619469, "grad_norm": 12.683737754821777, "learning_rate": 7.61522041561069e-09, "logits/chosen": -1.30859375, "logits/rejected": -1.11328125, "logps/chosen": -246.0, "logps/rejected": -265.0, "loss": 0.2762, "rewards/accuracies": 0.84375, "rewards/chosen": 0.81640625, "rewards/margins": 2.859375, "rewards/rejected": -2.046875, "step": 836 }, { "epoch": 0.9258849557522124, "grad_norm": 15.07400894165039, "learning_rate": 7.397176071001543e-09, "logits/chosen": -1.35546875, "logits/rejected": -1.1796875, "logps/chosen": -251.0, "logps/rejected": -267.0, "loss": 0.3266, "rewards/accuracies": 0.8125, "rewards/chosen": 0.708984375, "rewards/margins": 2.6171875, "rewards/rejected": -1.90625, "step": 837 }, { "epoch": 0.9269911504424779, "grad_norm": 12.571556091308594, "learning_rate": 7.182251958628538e-09, "logits/chosen": -1.33984375, "logits/rejected": -1.21875, "logps/chosen": -236.5, "logps/rejected": -259.0, "loss": 0.2943, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.5966796875, "rewards/margins": 2.7109375, "rewards/rejected": -2.12109375, "step": 838 }, { "epoch": 0.9280973451327433, "grad_norm": 12.665489196777344, "learning_rate": 6.970450842734649e-09, "logits/chosen": -1.375, "logits/rejected": -1.171875, "logps/chosen": -260.0, "logps/rejected": -276.0, "loss": 0.2713, "rewards/accuracies": 0.875, "rewards/chosen": 0.7734375, "rewards/margins": 3.0078125, "rewards/rejected": -2.234375, "step": 839 }, { "epoch": 0.9292035398230089, "grad_norm": 15.426192283630371, "learning_rate": 6.761775447396506e-09, "logits/chosen": -1.26171875, "logits/rejected": -1.203125, "logps/chosen": -244.0, "logps/rejected": -297.0, "loss": 0.3234, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.791015625, "rewards/margins": 3.015625, "rewards/rejected": -2.234375, "step": 840 }, { "epoch": 0.9303097345132744, "grad_norm": 12.329756736755371, "learning_rate": 6.556228456489232e-09, "logits/chosen": -1.1875, "logits/rejected": -1.0859375, "logps/chosen": -253.5, "logps/rejected": -280.0, "loss": 0.2926, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.6484375, "rewards/margins": 2.9453125, "rewards/rejected": -2.296875, "step": 841 }, { "epoch": 0.9314159292035398, "grad_norm": 16.28516387939453, "learning_rate": 6.353812513652052e-09, "logits/chosen": -1.2109375, "logits/rejected": -1.10546875, "logps/chosen": -260.0, "logps/rejected": -282.0, "loss": 0.3844, "rewards/accuracies": 0.7578125, "rewards/chosen": 0.529296875, "rewards/margins": 2.4453125, "rewards/rejected": -1.9140625, "step": 842 }, { "epoch": 0.9325221238938053, "grad_norm": 16.69934844970703, "learning_rate": 6.154530222254372e-09, "logits/chosen": -1.25390625, "logits/rejected": -1.21875, "logps/chosen": -245.5, "logps/rejected": -280.0, "loss": 0.3776, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.6630859375, "rewards/margins": 2.53125, "rewards/rejected": -1.8671875, "step": 843 }, { "epoch": 0.9336283185840708, "grad_norm": 15.312355995178223, "learning_rate": 5.958384145362038e-09, "logits/chosen": -1.27734375, "logits/rejected": -1.1796875, "logps/chosen": -267.5, "logps/rejected": -304.0, "loss": 0.3446, "rewards/accuracies": 0.828125, "rewards/chosen": 0.619140625, "rewards/margins": 2.7265625, "rewards/rejected": -2.109375, "step": 844 }, { "epoch": 0.9347345132743363, "grad_norm": 13.851134300231934, "learning_rate": 5.765376805704575e-09, "logits/chosen": -1.296875, "logits/rejected": -1.16015625, "logps/chosen": -242.5, "logps/rejected": -286.0, "loss": 0.312, "rewards/accuracies": 0.8125, "rewards/chosen": 0.826171875, "rewards/margins": 2.78125, "rewards/rejected": -1.953125, "step": 845 }, { "epoch": 0.9358407079646017, "grad_norm": 12.364534378051758, "learning_rate": 5.575510685642798e-09, "logits/chosen": -1.1328125, "logits/rejected": -1.1875, "logps/chosen": -265.0, "logps/rejected": -298.0, "loss": 0.2532, "rewards/accuracies": 0.8671875, "rewards/chosen": 1.015625, "rewards/margins": 3.3671875, "rewards/rejected": -2.3515625, "step": 846 }, { "epoch": 0.9369469026548672, "grad_norm": 15.209588050842285, "learning_rate": 5.38878822713662e-09, "logits/chosen": -1.25390625, "logits/rejected": -1.10546875, "logps/chosen": -279.0, "logps/rejected": -300.0, "loss": 0.3528, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.505859375, "rewards/margins": 2.6796875, "rewards/rejected": -2.1796875, "step": 847 }, { "epoch": 0.9380530973451328, "grad_norm": 13.730789184570312, "learning_rate": 5.205211831713935e-09, "logits/chosen": -1.37109375, "logits/rejected": -1.125, "logps/chosen": -239.5, "logps/rejected": -240.0, "loss": 0.3282, "rewards/accuracies": 0.796875, "rewards/chosen": 0.6494140625, "rewards/margins": 2.921875, "rewards/rejected": -2.2734375, "step": 848 }, { "epoch": 0.9391592920353983, "grad_norm": 13.921919822692871, "learning_rate": 5.024783860439474e-09, "logits/chosen": -1.28125, "logits/rejected": -1.08203125, "logps/chosen": -228.0, "logps/rejected": -262.0, "loss": 0.3565, "rewards/accuracies": 0.7421875, "rewards/chosen": 0.517578125, "rewards/margins": 2.5703125, "rewards/rejected": -2.046875, "step": 849 }, { "epoch": 0.9402654867256637, "grad_norm": 15.472764015197754, "learning_rate": 4.8475066338846685e-09, "logits/chosen": -1.3515625, "logits/rejected": -1.15234375, "logps/chosen": -252.0, "logps/rejected": -282.0, "loss": 0.3386, "rewards/accuracies": 0.78125, "rewards/chosen": 0.658203125, "rewards/margins": 2.8203125, "rewards/rejected": -2.1640625, "step": 850 }, { "epoch": 0.9402654867256637, "eval_logits/chosen": -1.2672574520111084, "eval_logits/rejected": -1.1583489179611206, "eval_logps/chosen": -253.592041015625, "eval_logps/rejected": -280.3034973144531, "eval_loss": 0.31901347637176514, "eval_rewards/accuracies": 0.8145152926445007, "eval_rewards/chosen": 0.7268248796463013, "eval_rewards/margins": 2.8418843746185303, "eval_rewards/rejected": -2.1149721145629883, "eval_runtime": 192.9475, "eval_samples_per_second": 66.614, "eval_steps_per_second": 1.042, "step": 850 }, { "epoch": 0.9413716814159292, "grad_norm": 13.594935417175293, "learning_rate": 4.673382432097667e-09, "logits/chosen": -1.3515625, "logits/rejected": -1.23046875, "logps/chosen": -256.0, "logps/rejected": -263.0, "loss": 0.3324, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.6171875, "rewards/margins": 2.7265625, "rewards/rejected": -2.1171875, "step": 851 }, { "epoch": 0.9424778761061947, "grad_norm": 14.526602745056152, "learning_rate": 4.5024134945740036e-09, "logits/chosen": -1.3203125, "logits/rejected": -1.22265625, "logps/chosen": -229.5, "logps/rejected": -244.5, "loss": 0.3492, "rewards/accuracies": 0.796875, "rewards/chosen": 0.546875, "rewards/margins": 2.6640625, "rewards/rejected": -2.109375, "step": 852 }, { "epoch": 0.9435840707964602, "grad_norm": 16.662525177001953, "learning_rate": 4.334602020227867e-09, "logits/chosen": -1.34375, "logits/rejected": -1.15625, "logps/chosen": -285.0, "logps/rejected": -290.0, "loss": 0.3672, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4638671875, "rewards/margins": 2.3984375, "rewards/rejected": -1.94140625, "step": 853 }, { "epoch": 0.9446902654867256, "grad_norm": 14.094331741333008, "learning_rate": 4.169950167363767e-09, "logits/chosen": -1.265625, "logits/rejected": -1.0625, "logps/chosen": -263.0, "logps/rejected": -297.0, "loss": 0.3088, "rewards/accuracies": 0.78125, "rewards/chosen": 0.84765625, "rewards/margins": 2.9375, "rewards/rejected": -2.09375, "step": 854 }, { "epoch": 0.9457964601769911, "grad_norm": 14.868205070495605, "learning_rate": 4.0084600536488265e-09, "logits/chosen": -1.38671875, "logits/rejected": -1.17578125, "logps/chosen": -238.0, "logps/rejected": -290.0, "loss": 0.3156, "rewards/accuracies": 0.828125, "rewards/chosen": 0.8359375, "rewards/margins": 2.8125, "rewards/rejected": -1.9765625, "step": 855 }, { "epoch": 0.9469026548672567, "grad_norm": 13.155553817749023, "learning_rate": 3.850133756085505e-09, "logits/chosen": -1.31640625, "logits/rejected": -1.15234375, "logps/chosen": -270.0, "logps/rejected": -290.0, "loss": 0.3135, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.7265625, "rewards/margins": 2.7890625, "rewards/rejected": -2.0625, "step": 856 }, { "epoch": 0.9480088495575221, "grad_norm": 13.842921257019043, "learning_rate": 3.694973310984839e-09, "logits/chosen": -1.359375, "logits/rejected": -1.15625, "logps/chosen": -258.0, "logps/rejected": -281.0, "loss": 0.3115, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.759765625, "rewards/margins": 2.6171875, "rewards/rejected": -1.859375, "step": 857 }, { "epoch": 0.9491150442477876, "grad_norm": 13.213567733764648, "learning_rate": 3.5429807139403524e-09, "logits/chosen": -1.22265625, "logits/rejected": -1.048828125, "logps/chosen": -243.0, "logps/rejected": -300.0, "loss": 0.2749, "rewards/accuracies": 0.859375, "rewards/chosen": 0.87890625, "rewards/margins": 3.2109375, "rewards/rejected": -2.328125, "step": 858 }, { "epoch": 0.9502212389380531, "grad_norm": 11.955760955810547, "learning_rate": 3.3941579198023816e-09, "logits/chosen": -1.484375, "logits/rejected": -1.13671875, "logps/chosen": -218.0, "logps/rejected": -260.0, "loss": 0.2961, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.8046875, "rewards/margins": 2.7421875, "rewards/rejected": -1.93359375, "step": 859 }, { "epoch": 0.9513274336283186, "grad_norm": 13.337422370910645, "learning_rate": 3.248506842652793e-09, "logits/chosen": -1.2578125, "logits/rejected": -1.12109375, "logps/chosen": -249.0, "logps/rejected": -309.0, "loss": 0.2853, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.8349609375, "rewards/margins": 3.078125, "rewards/rejected": -2.25, "step": 860 }, { "epoch": 0.952433628318584, "grad_norm": 12.912832260131836, "learning_rate": 3.106029355780582e-09, "logits/chosen": -1.234375, "logits/rejected": -1.15234375, "logps/chosen": -271.0, "logps/rejected": -282.0, "loss": 0.3052, "rewards/accuracies": 0.828125, "rewards/chosen": 0.447265625, "rewards/margins": 2.65625, "rewards/rejected": -2.20703125, "step": 861 }, { "epoch": 0.9535398230088495, "grad_norm": 14.942134857177734, "learning_rate": 2.9667272916575337e-09, "logits/chosen": -1.20703125, "logits/rejected": -1.04296875, "logps/chosen": -247.5, "logps/rejected": -279.0, "loss": 0.3356, "rewards/accuracies": 0.734375, "rewards/chosen": 0.69921875, "rewards/margins": 2.8671875, "rewards/rejected": -2.1640625, "step": 862 }, { "epoch": 0.9546460176991151, "grad_norm": 11.314682960510254, "learning_rate": 2.830602441914881e-09, "logits/chosen": -1.23828125, "logits/rejected": -1.1875, "logps/chosen": -263.0, "logps/rejected": -277.0, "loss": 0.2615, "rewards/accuracies": 0.828125, "rewards/chosen": 0.80859375, "rewards/margins": 3.1953125, "rewards/rejected": -2.390625, "step": 863 }, { "epoch": 0.9557522123893806, "grad_norm": 13.024490356445312, "learning_rate": 2.6976565573202102e-09, "logits/chosen": -1.23828125, "logits/rejected": -1.23046875, "logps/chosen": -249.0, "logps/rejected": -275.0, "loss": 0.2961, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.734375, "rewards/margins": 2.7890625, "rewards/rejected": -2.0546875, "step": 864 }, { "epoch": 0.956858407079646, "grad_norm": 13.718770980834961, "learning_rate": 2.5678913477547302e-09, "logits/chosen": -1.39453125, "logits/rejected": -1.1796875, "logps/chosen": -274.0, "logps/rejected": -312.0, "loss": 0.2869, "rewards/accuracies": 0.875, "rewards/chosen": 0.853515625, "rewards/margins": 2.7890625, "rewards/rejected": -1.9296875, "step": 865 }, { "epoch": 0.9579646017699115, "grad_norm": 13.562867164611816, "learning_rate": 2.441308482191623e-09, "logits/chosen": -1.12890625, "logits/rejected": -1.0078125, "logps/chosen": -252.5, "logps/rejected": -291.0, "loss": 0.3117, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.845703125, "rewards/margins": 2.8359375, "rewards/rejected": -1.98828125, "step": 866 }, { "epoch": 0.959070796460177, "grad_norm": 13.698179244995117, "learning_rate": 2.3179095886743384e-09, "logits/chosen": -1.2890625, "logits/rejected": -1.234375, "logps/chosen": -230.5, "logps/rejected": -266.5, "loss": 0.3103, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.9375, "rewards/margins": 3.1484375, "rewards/rejected": -2.20703125, "step": 867 }, { "epoch": 0.9601769911504425, "grad_norm": 13.498557090759277, "learning_rate": 2.1976962542956945e-09, "logits/chosen": -1.2890625, "logits/rejected": -1.22265625, "logps/chosen": -245.5, "logps/rejected": -281.0, "loss": 0.3036, "rewards/accuracies": 0.828125, "rewards/chosen": 0.935546875, "rewards/margins": 2.9453125, "rewards/rejected": -2.0, "step": 868 }, { "epoch": 0.9612831858407079, "grad_norm": 12.500775337219238, "learning_rate": 2.0806700251775055e-09, "logits/chosen": -1.296875, "logits/rejected": -1.171875, "logps/chosen": -232.5, "logps/rejected": -262.0, "loss": 0.2973, "rewards/accuracies": 0.8125, "rewards/chosen": 0.724609375, "rewards/margins": 2.953125, "rewards/rejected": -2.2265625, "step": 869 }, { "epoch": 0.9623893805309734, "grad_norm": 13.277873992919922, "learning_rate": 1.966832406450708e-09, "logits/chosen": -1.3828125, "logits/rejected": -1.15234375, "logps/chosen": -232.0, "logps/rejected": -260.0, "loss": 0.3434, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.5185546875, "rewards/margins": 2.3984375, "rewards/rejected": -1.87890625, "step": 870 }, { "epoch": 0.963495575221239, "grad_norm": 14.751419067382812, "learning_rate": 1.85618486223596e-09, "logits/chosen": -1.234375, "logits/rejected": -1.1484375, "logps/chosen": -269.0, "logps/rejected": -291.0, "loss": 0.3578, "rewards/accuracies": 0.828125, "rewards/chosen": 0.4072265625, "rewards/margins": 2.484375, "rewards/rejected": -2.07421875, "step": 871 }, { "epoch": 0.9646017699115044, "grad_norm": 16.29852294921875, "learning_rate": 1.748728815624878e-09, "logits/chosen": -1.328125, "logits/rejected": -1.109375, "logps/chosen": -273.0, "logps/rejected": -264.0, "loss": 0.3518, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.50927734375, "rewards/margins": 2.734375, "rewards/rejected": -2.2265625, "step": 872 }, { "epoch": 0.9657079646017699, "grad_norm": 16.783334732055664, "learning_rate": 1.6444656486615805e-09, "logits/chosen": -1.1328125, "logits/rejected": -1.07421875, "logps/chosen": -287.0, "logps/rejected": -307.0, "loss": 0.3656, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.478515625, "rewards/margins": 2.4765625, "rewards/rejected": -2.00390625, "step": 873 }, { "epoch": 0.9668141592920354, "grad_norm": 16.244199752807617, "learning_rate": 1.5433967023250894e-09, "logits/chosen": -1.37109375, "logits/rejected": -1.1015625, "logps/chosen": -275.0, "logps/rejected": -317.0, "loss": 0.3542, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.7353515625, "rewards/margins": 2.8359375, "rewards/rejected": -2.09375, "step": 874 }, { "epoch": 0.9679203539823009, "grad_norm": 13.663660049438477, "learning_rate": 1.4455232765120396e-09, "logits/chosen": -1.3359375, "logits/rejected": -1.22265625, "logps/chosen": -244.5, "logps/rejected": -268.0, "loss": 0.3567, "rewards/accuracies": 0.75, "rewards/chosen": 0.701171875, "rewards/margins": 2.625, "rewards/rejected": -1.921875, "step": 875 }, { "epoch": 0.9690265486725663, "grad_norm": 12.790926933288574, "learning_rate": 1.3508466300198306e-09, "logits/chosen": -1.4296875, "logits/rejected": -1.21875, "logps/chosen": -232.5, "logps/rejected": -262.0, "loss": 0.3053, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.685546875, "rewards/margins": 2.921875, "rewards/rejected": -2.234375, "step": 876 }, { "epoch": 0.9701327433628318, "grad_norm": 15.329063415527344, "learning_rate": 1.2593679805306401e-09, "logits/chosen": -1.20703125, "logits/rejected": -1.20703125, "logps/chosen": -254.5, "logps/rejected": -278.0, "loss": 0.3161, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.87890625, "rewards/margins": 2.9609375, "rewards/rejected": -2.07421875, "step": 877 }, { "epoch": 0.9712389380530974, "grad_norm": 15.826077461242676, "learning_rate": 1.1710885045956021e-09, "logits/chosen": -1.41015625, "logits/rejected": -1.26953125, "logps/chosen": -257.5, "logps/rejected": -281.0, "loss": 0.3719, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.478515625, "rewards/margins": 2.2734375, "rewards/rejected": -1.79296875, "step": 878 }, { "epoch": 0.9723451327433629, "grad_norm": 15.952284812927246, "learning_rate": 1.0860093376197642e-09, "logits/chosen": -1.28125, "logits/rejected": -1.05078125, "logps/chosen": -260.0, "logps/rejected": -289.0, "loss": 0.3437, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.73828125, "rewards/margins": 2.859375, "rewards/rejected": -2.1171875, "step": 879 }, { "epoch": 0.9734513274336283, "grad_norm": 13.334358215332031, "learning_rate": 1.0041315738474055e-09, "logits/chosen": -1.203125, "logits/rejected": -1.0859375, "logps/chosen": -261.5, "logps/rejected": -312.0, "loss": 0.2845, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.869140625, "rewards/margins": 3.125, "rewards/rejected": -2.265625, "step": 880 }, { "epoch": 0.9745575221238938, "grad_norm": 11.215107917785645, "learning_rate": 9.254562663480458e-10, "logits/chosen": -1.3125, "logits/rejected": -1.2265625, "logps/chosen": -241.5, "logps/rejected": -287.0, "loss": 0.2595, "rewards/accuracies": 0.8828125, "rewards/chosen": 1.03125, "rewards/margins": 3.1484375, "rewards/rejected": -2.125, "step": 881 }, { "epoch": 0.9756637168141593, "grad_norm": 13.879293441772461, "learning_rate": 8.499844270028755e-10, "logits/chosen": -1.3046875, "logits/rejected": -1.10546875, "logps/chosen": -250.5, "logps/rejected": -267.5, "loss": 0.3143, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.494140625, "rewards/margins": 2.84375, "rewards/rejected": -2.3515625, "step": 882 }, { "epoch": 0.9767699115044248, "grad_norm": 425.59161376953125, "learning_rate": 7.777170264917365e-10, "logits/chosen": -1.2421875, "logits/rejected": -1.0703125, "logps/chosen": -260.0, "logps/rejected": -347.0, "loss": 0.4257, "rewards/accuracies": 0.84375, "rewards/chosen": 0.646484375, "rewards/margins": 2.890625, "rewards/rejected": -2.2421875, "step": 883 }, { "epoch": 0.9778761061946902, "grad_norm": 25.028003692626953, "learning_rate": 7.086549942805498e-10, "logits/chosen": -1.19921875, "logits/rejected": -1.11328125, "logps/chosen": -285.0, "logps/rejected": -282.0, "loss": 0.3772, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.56640625, "rewards/margins": 2.359375, "rewards/rejected": -1.79296875, "step": 884 }, { "epoch": 0.9789823008849557, "grad_norm": 15.059175491333008, "learning_rate": 6.427992186095744e-10, "logits/chosen": -1.28515625, "logits/rejected": -1.19140625, "logps/chosen": -228.0, "logps/rejected": -271.0, "loss": 0.3026, "rewards/accuracies": 0.8125, "rewards/chosen": 0.94921875, "rewards/margins": 3.140625, "rewards/rejected": -2.1953125, "step": 885 }, { "epoch": 0.9800884955752213, "grad_norm": 12.722869873046875, "learning_rate": 5.801505464817502e-10, "logits/chosen": -1.171875, "logits/rejected": -1.08984375, "logps/chosen": -244.0, "logps/rejected": -279.0, "loss": 0.3066, "rewards/accuracies": 0.828125, "rewards/chosen": 0.61328125, "rewards/margins": 2.6640625, "rewards/rejected": -2.0546875, "step": 886 }, { "epoch": 0.9811946902654868, "grad_norm": 13.964948654174805, "learning_rate": 5.207097836519569e-10, "logits/chosen": -1.2421875, "logits/rejected": -1.1171875, "logps/chosen": -249.5, "logps/rejected": -287.0, "loss": 0.3159, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8046875, "rewards/margins": 2.953125, "rewards/rejected": -2.1484375, "step": 887 }, { "epoch": 0.9823008849557522, "grad_norm": 13.418638229370117, "learning_rate": 4.644776946165774e-10, "logits/chosen": -1.2734375, "logits/rejected": -1.1484375, "logps/chosen": -246.0, "logps/rejected": -253.5, "loss": 0.3351, "rewards/accuracies": 0.8671875, "rewards/chosen": 0.580078125, "rewards/margins": 2.7734375, "rewards/rejected": -2.1953125, "step": 888 }, { "epoch": 0.9834070796460177, "grad_norm": 11.94414234161377, "learning_rate": 4.114550026037278e-10, "logits/chosen": -1.30078125, "logits/rejected": -1.10546875, "logps/chosen": -237.0, "logps/rejected": -285.0, "loss": 0.2559, "rewards/accuracies": 0.8671875, "rewards/chosen": 0.701171875, "rewards/margins": 3.1484375, "rewards/rejected": -2.453125, "step": 889 }, { "epoch": 0.9845132743362832, "grad_norm": 14.505678176879883, "learning_rate": 3.6164238956384876e-10, "logits/chosen": -1.21484375, "logits/rejected": -1.26953125, "logps/chosen": -248.5, "logps/rejected": -281.0, "loss": 0.2998, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.876953125, "rewards/margins": 2.875, "rewards/rejected": -2.00390625, "step": 890 }, { "epoch": 0.9856194690265486, "grad_norm": 12.155240058898926, "learning_rate": 3.150404961611008e-10, "logits/chosen": -1.234375, "logits/rejected": -1.140625, "logps/chosen": -240.0, "logps/rejected": -276.0, "loss": 0.2918, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.685546875, "rewards/margins": 3.0078125, "rewards/rejected": -2.3203125, "step": 891 }, { "epoch": 0.9867256637168141, "grad_norm": 13.752731323242188, "learning_rate": 2.716499217649271e-10, "logits/chosen": -1.2109375, "logits/rejected": -1.1640625, "logps/chosen": -241.5, "logps/rejected": -277.0, "loss": 0.3461, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.638671875, "rewards/margins": 2.6015625, "rewards/rejected": -1.9609375, "step": 892 }, { "epoch": 0.9878318584070797, "grad_norm": 14.712821960449219, "learning_rate": 2.3147122444250323e-10, "logits/chosen": -1.2265625, "logits/rejected": -1.15234375, "logps/chosen": -242.0, "logps/rejected": -274.0, "loss": 0.3957, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.4921875, "rewards/margins": 2.5234375, "rewards/rejected": -2.03125, "step": 893 }, { "epoch": 0.9889380530973452, "grad_norm": 13.806950569152832, "learning_rate": 1.9450492095149373e-10, "logits/chosen": -1.27734375, "logits/rejected": -1.06640625, "logps/chosen": -251.0, "logps/rejected": -282.0, "loss": 0.3152, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.84375, "rewards/margins": 2.9921875, "rewards/rejected": -2.1484375, "step": 894 }, { "epoch": 0.9900442477876106, "grad_norm": 13.336440086364746, "learning_rate": 1.607514867333626e-10, "logits/chosen": -1.17578125, "logits/rejected": -1.0546875, "logps/chosen": -273.5, "logps/rejected": -280.0, "loss": 0.3012, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.646484375, "rewards/margins": 2.7578125, "rewards/rejected": -2.1015625, "step": 895 }, { "epoch": 0.9911504424778761, "grad_norm": 14.591585159301758, "learning_rate": 1.3021135590740583e-10, "logits/chosen": -1.30078125, "logits/rejected": -1.10546875, "logps/chosen": -255.0, "logps/rejected": -281.0, "loss": 0.356, "rewards/accuracies": 0.796875, "rewards/chosen": 0.66015625, "rewards/margins": 2.4921875, "rewards/rejected": -1.83203125, "step": 896 }, { "epoch": 0.9922566371681416, "grad_norm": 14.33768367767334, "learning_rate": 1.028849212649785e-10, "logits/chosen": -1.33203125, "logits/rejected": -1.1875, "logps/chosen": -272.0, "logps/rejected": -264.0, "loss": 0.3197, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.716796875, "rewards/margins": 2.765625, "rewards/rejected": -2.046875, "step": 897 }, { "epoch": 0.9933628318584071, "grad_norm": 14.789177894592285, "learning_rate": 7.877253426458175e-11, "logits/chosen": -1.2890625, "logits/rejected": -1.125, "logps/chosen": -253.5, "logps/rejected": -296.0, "loss": 0.3679, "rewards/accuracies": 0.796875, "rewards/chosen": 0.57421875, "rewards/margins": 2.3984375, "rewards/rejected": -1.82421875, "step": 898 }, { "epoch": 0.9944690265486725, "grad_norm": 14.226619720458984, "learning_rate": 5.7874505027283304e-11, "logits/chosen": -1.265625, "logits/rejected": -1.12890625, "logps/chosen": -256.0, "logps/rejected": -263.5, "loss": 0.3177, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.6015625, "rewards/margins": 2.6796875, "rewards/rejected": -2.0703125, "step": 899 }, { "epoch": 0.995575221238938, "grad_norm": 14.491003036499023, "learning_rate": 4.0191102332748364e-11, "logits/chosen": -1.37109375, "logits/rejected": -1.2421875, "logps/chosen": -261.0, "logps/rejected": -300.0, "loss": 0.2955, "rewards/accuracies": 0.8125, "rewards/chosen": 0.892578125, "rewards/margins": 2.9140625, "rewards/rejected": -2.0234375, "step": 900 }, { "epoch": 0.995575221238938, "eval_logits/chosen": -1.2664412260055542, "eval_logits/rejected": -1.1579796075820923, "eval_logps/chosen": -253.59701538085938, "eval_logps/rejected": -280.3333435058594, "eval_loss": 0.31904885172843933, "eval_rewards/accuracies": 0.8163970708847046, "eval_rewards/chosen": 0.7264896035194397, "eval_rewards/margins": 2.841573476791382, "eval_rewards/rejected": -2.1163711547851562, "eval_runtime": 193.0253, "eval_samples_per_second": 66.587, "eval_steps_per_second": 1.041, "step": 900 }, { "epoch": 0.9966814159292036, "grad_norm": 11.916271209716797, "learning_rate": 2.5722553615770137e-11, "logits/chosen": -1.31640625, "logits/rejected": -1.11328125, "logps/chosen": -246.0, "logps/rejected": -269.5, "loss": 0.2564, "rewards/accuracies": 0.859375, "rewards/chosen": 0.962890625, "rewards/margins": 3.4609375, "rewards/rejected": -2.4921875, "step": 901 }, { "epoch": 0.9977876106194691, "grad_norm": 15.206621170043945, "learning_rate": 1.4469044963355547e-11, "logits/chosen": -1.2109375, "logits/rejected": -1.08984375, "logps/chosen": -250.5, "logps/rejected": -298.0, "loss": 0.3246, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.677734375, "rewards/margins": 2.6953125, "rewards/rejected": -2.015625, "step": 902 }, { "epoch": 0.9988938053097345, "grad_norm": 16.438447952270508, "learning_rate": 6.430721112282711e-12, "logits/chosen": -1.265625, "logits/rejected": -1.19921875, "logps/chosen": -255.5, "logps/rejected": -294.0, "loss": 0.4007, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6181640625, "rewards/margins": 2.5703125, "rewards/rejected": -1.953125, "step": 903 }, { "epoch": 1.0, "grad_norm": 13.59802532196045, "learning_rate": 1.6076854473801027e-12, "logits/chosen": -1.359375, "logits/rejected": -1.16796875, "logps/chosen": -269.0, "logps/rejected": -293.0, "loss": 0.2951, "rewards/accuracies": 0.875, "rewards/chosen": 0.6875, "rewards/margins": 2.75, "rewards/rejected": -2.0625, "step": 904 } ], "logging_steps": 1, "max_steps": 904, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "train_dataloader_state_dict": null, "trial_name": null, "trial_params": null }