diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,3433 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.22148394241417496,
+  "eval_steps": 500,
+  "global_step": 200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "completion_length": 73.265625,
+      "epoch": 0.0011074197120708748,
+      "grad_norm": 0.5076314806938171,
+      "kl": 0.0,
+      "learning_rate": 9.99375e-07,
+      "loss": -0.018259915290400386,
+      "reward": 2.2648561000823975,
+      "reward_std": 0.32521533221006393,
+      "rewards/GDino": 0.84943026304245,
+      "rewards/GIT": 0.5776679813861847,
+      "rewards/HPSv2": 0.2639656066894531,
+      "rewards/ORM": 0.5737921893596649,
+      "self_certainty_semantic": -25.4375,
+      "self_certainty_token": -22.0,
+      "step": 1
+    },
+    {
+      "completion_length": 56.0,
+      "epoch": 0.0022148394241417496,
+      "grad_norm": 0.5364330410957336,
+      "kl": 0.001522064208984375,
+      "learning_rate": 9.9875e-07,
+      "loss": 0.00348748016403988,
+      "reward": 1.7680926322937012,
+      "reward_std": 0.41801488399505615,
+      "rewards/GDino": 0.6529064476490021,
+      "rewards/GIT": 0.19494981318712234,
+      "rewards/HPSv2": 0.24983596801757812,
+      "rewards/ORM": 0.6704004406929016,
+      "self_certainty_semantic": -25.4375,
+      "self_certainty_token": -21.0,
+      "step": 2
+    },
+    {
+      "completion_length": 55.4375,
+      "epoch": 0.0033222591362126247,
+      "grad_norm": 0.5614722967147827,
+      "kl": 0.001556396484375,
+      "learning_rate": 9.98125e-07,
+      "loss": 0.01565772108733654,
+      "reward": 1.6570448875427246,
+      "reward_std": 0.3965621292591095,
+      "rewards/GDino": 0.6382372081279755,
+      "rewards/GIT": 0.37795570492744446,
+      "rewards/HPSv2": 0.24709796905517578,
+      "rewards/ORM": 0.3937540017068386,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -20.9375,
+      "step": 3
+    },
+    {
+      "completion_length": 65.34375,
+      "epoch": 0.004429678848283499,
+      "grad_norm": 2.5736770629882812,
+      "kl": 0.0016021728515625,
+      "learning_rate": 9.975e-07,
+      "loss": -0.0012893765233457088,
+      "reward": 2.061529755592346,
+      "reward_std": 0.4106704443693161,
+      "rewards/GDino": 0.7796730995178223,
+      "rewards/GIT": 0.43717896938323975,
+      "rewards/HPSv2": 0.24744796752929688,
+      "rewards/ORM": 0.5972296595573425,
+      "self_certainty_semantic": -25.5,
+      "self_certainty_token": -22.0,
+      "step": 4
+    },
+    {
+      "completion_length": 63.578125,
+      "epoch": 0.005537098560354375,
+      "grad_norm": 0.48238250613212585,
+      "kl": 0.001575469970703125,
+      "learning_rate": 9.968749999999999e-07,
+      "loss": 0.020129199139773846,
+      "reward": 1.5302643775939941,
+      "reward_std": 0.44902199506759644,
+      "rewards/GDino": 0.6246840953826904,
+      "rewards/GIT": 0.23608428239822388,
+      "rewards/HPSv2": 0.2453451156616211,
+      "rewards/ORM": 0.42415088415145874,
+      "self_certainty_semantic": -25.625,
+      "self_certainty_token": -22.1875,
+      "step": 5
+    },
+    {
+      "completion_length": 60.65625,
+      "epoch": 0.006644518272425249,
+      "grad_norm": 0.8221905827522278,
+      "kl": 0.001674652099609375,
+      "learning_rate": 9.9625e-07,
+      "loss": 0.0192068200558424,
+      "reward": 2.1602972745895386,
+      "reward_std": 0.23134037852287292,
+      "rewards/GDino": 0.783700168132782,
+      "rewards/GIT": 0.452057421207428,
+      "rewards/HPSv2": 0.274627685546875,
+      "rewards/ORM": 0.6499120593070984,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -22.0,
+      "step": 6
+    },
+    {
+      "completion_length": 65.453125,
+      "epoch": 0.007751937984496124,
+      "grad_norm": 0.433403879404068,
+      "kl": 0.0016021728515625,
+      "learning_rate": 9.956249999999999e-07,
+      "loss": 0.028950304724276066,
+      "reward": 1.7097668647766113,
+      "reward_std": 0.5880981385707855,
+      "rewards/GDino": 0.5914062708616257,
+      "rewards/GIT": 0.15753822773694992,
+      "rewards/HPSv2": 0.25023555755615234,
+      "rewards/ORM": 0.7105867862701416,
+      "self_certainty_semantic": -25.5,
+      "self_certainty_token": -21.75,
+      "step": 7
+    },
+    {
+      "completion_length": 74.90625,
+      "epoch": 0.008859357696566999,
+      "grad_norm": 0.41245806217193604,
+      "kl": 0.00152587890625,
+      "learning_rate": 9.95e-07,
+      "loss": -0.016540683340281248,
+      "reward": 1.785366177558899,
+      "reward_std": 0.39637817442417145,
+      "rewards/GDino": 0.7011832594871521,
+      "rewards/GIT": 0.3848375529050827,
+      "rewards/HPSv2": 0.2445659637451172,
+      "rewards/ORM": 0.45477938652038574,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -20.875,
+      "step": 8
+    },
+    {
+      "completion_length": 61.828125,
+      "epoch": 0.009966777408637873,
+      "grad_norm": 0.3924250602722168,
+      "kl": 0.001617431640625,
+      "learning_rate": 9.94375e-07,
+      "loss": 0.03069412149488926,
+      "reward": 2.0813064575195312,
+      "reward_std": 0.5435488224029541,
+      "rewards/GDino": 0.736801415681839,
+      "rewards/GIT": 0.32275132089853287,
+      "rewards/HPSv2": 0.26233673095703125,
+      "rewards/ORM": 0.759416937828064,
+      "self_certainty_semantic": -25.5625,
+      "self_certainty_token": -21.1875,
+      "step": 9
+    },
+    {
+      "completion_length": 62.796875,
+      "epoch": 0.01107419712070875,
+      "grad_norm": 0.5886948704719543,
+      "kl": 0.00164031982421875,
+      "learning_rate": 9.9375e-07,
+      "loss": -0.009089878294616938,
+      "reward": 1.8167259693145752,
+      "reward_std": 0.4427160769701004,
+      "rewards/GDino": 0.6997816860675812,
+      "rewards/GIT": 0.4742187559604645,
+      "rewards/HPSv2": 0.2480792999267578,
+      "rewards/ORM": 0.3946462571620941,
+      "self_certainty_semantic": -25.5,
+      "self_certainty_token": -21.125,
+      "step": 10
+    },
+    {
+      "completion_length": 64.09375,
+      "epoch": 0.012181616832779624,
+      "grad_norm": 0.6388463377952576,
+      "kl": 0.0016326904296875,
+      "learning_rate": 9.93125e-07,
+      "loss": -0.011163983959704638,
+      "reward": 2.250586152076721,
+      "reward_std": 0.29546695202589035,
+      "rewards/GDino": 0.7932291626930237,
+      "rewards/GIT": 0.5437096580862999,
+      "rewards/HPSv2": 0.25614356994628906,
+      "rewards/ORM": 0.657503753900528,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -21.0625,
+      "step": 11
+    },
+    {
+      "completion_length": 73.265625,
+      "epoch": 0.013289036544850499,
+      "grad_norm": 0.37963175773620605,
+      "kl": 0.001583099365234375,
+      "learning_rate": 9.925e-07,
+      "loss": 0.009535952471196651,
+      "reward": 1.8723560571670532,
+      "reward_std": 0.48824670910835266,
+      "rewards/GDino": 0.671429455280304,
+      "rewards/GIT": 0.4155814051628113,
+      "rewards/HPSv2": 0.2387409210205078,
+      "rewards/ORM": 0.5466042459011078,
+      "self_certainty_semantic": -25.4375,
+      "self_certainty_token": -21.625,
+      "step": 12
+    },
+    {
+      "completion_length": 55.015625,
+      "epoch": 0.014396456256921373,
+      "grad_norm": 0.5844080448150635,
+      "kl": 0.001674652099609375,
+      "learning_rate": 9.91875e-07,
+      "loss": 0.0034986711107194424,
+      "reward": 1.7595484256744385,
+      "reward_std": 0.3697086051106453,
+      "rewards/GDino": 0.7100214958190918,
+      "rewards/GIT": 0.26869260519742966,
+      "rewards/HPSv2": 0.24958419799804688,
+      "rewards/ORM": 0.53125,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -21.375,
+      "step": 13
+    },
+    {
+      "completion_length": 55.65625,
+      "epoch": 0.015503875968992248,
+      "grad_norm": 0.5192797780036926,
+      "kl": 0.001674652099609375,
+      "learning_rate": 9.912499999999998e-07,
+      "loss": 0.010001872200518847,
+      "reward": 2.201015591621399,
+      "reward_std": 0.4899330288171768,
+      "rewards/GDino": 0.8140625059604645,
+      "rewards/GIT": 0.4328514188528061,
+      "rewards/HPSv2": 0.2431640625,
+      "rewards/ORM": 0.7109375,
+      "self_certainty_semantic": -25.4375,
+      "self_certainty_token": -22.0,
+      "step": 14
+    },
+    {
+      "completion_length": 64.0,
+      "epoch": 0.016611295681063124,
+      "grad_norm": 0.46844616532325745,
+      "kl": 0.00174713134765625,
+      "learning_rate": 9.90625e-07,
+      "loss": 0.0017675042618066072,
+      "reward": 2.433342456817627,
+      "reward_std": 0.33736473321914673,
+      "rewards/GDino": 0.9153576791286469,
+      "rewards/GIT": 0.5124611556529999,
+      "rewards/HPSv2": 0.2507901191711426,
+      "rewards/ORM": 0.7547334432601929,
+      "self_certainty_semantic": -25.4375,
+      "self_certainty_token": -21.6875,
+      "step": 15
+    },
+    {
+      "completion_length": 53.203125,
+      "epoch": 0.017718715393133997,
+      "grad_norm": 0.49579355120658875,
+      "kl": 0.001758575439453125,
+      "learning_rate": 9.9e-07,
+      "loss": 0.003856237977743149,
+      "reward": 1.6368815302848816,
+      "reward_std": 0.42226114869117737,
+      "rewards/GDino": 0.6432631015777588,
+      "rewards/GIT": 0.2906690910458565,
+      "rewards/HPSv2": 0.25169944763183594,
+      "rewards/ORM": 0.45124977827072144,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -20.8125,
+      "step": 16
+    },
+    {
+      "completion_length": 76.28125,
+      "epoch": 0.018826135105204873,
+      "grad_norm": 0.5296036601066589,
+      "kl": 0.001590728759765625,
+      "learning_rate": 9.89375e-07,
+      "loss": -0.003345506265759468,
+      "reward": 1.7861530184745789,
+      "reward_std": 0.5057752877473831,
+      "rewards/GDino": 0.6293700635433197,
+      "rewards/GIT": 0.2197464406490326,
+      "rewards/HPSv2": 0.26516151428222656,
+      "rewards/ORM": 0.671875,
+      "self_certainty_semantic": -25.5,
+      "self_certainty_token": -20.6875,
+      "step": 17
+    },
+    {
+      "completion_length": 58.0625,
+      "epoch": 0.019933554817275746,
+      "grad_norm": 0.6577962636947632,
+      "kl": 0.00174713134765625,
+      "learning_rate": 9.8875e-07,
+      "loss": -0.019500677473843098,
+      "reward": 2.303292393684387,
+      "reward_std": 0.2609405145049095,
+      "rewards/GDino": 0.8339102566242218,
+      "rewards/GIT": 0.5853700041770935,
+      "rewards/HPSv2": 0.24338722229003906,
+      "rewards/ORM": 0.640625,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -21.25,
+      "step": 18
+    },
+    {
+      "completion_length": 50.859375,
+      "epoch": 0.021040974529346623,
+      "grad_norm": 0.3543226718902588,
+      "kl": 0.00182342529296875,
+      "learning_rate": 9.88125e-07,
+      "loss": -0.00019507110118865967,
+      "reward": 1.6344053149223328,
+      "reward_std": 0.47374215722084045,
+      "rewards/GDino": 0.705148845911026,
+      "rewards/GIT": 0.2559727430343628,
+      "rewards/HPSv2": 0.2541370391845703,
+      "rewards/ORM": 0.41914665699005127,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -22.3125,
+      "step": 19
+    },
+    {
+      "completion_length": 65.921875,
+      "epoch": 0.0221483942414175,
+      "grad_norm": 0.5358290672302246,
+      "kl": 0.001781463623046875,
+      "learning_rate": 9.875e-07,
+      "loss": 0.007933363318443298,
+      "reward": 1.9504321217536926,
+      "reward_std": 0.3728322237730026,
+      "rewards/GDino": 0.6606760025024414,
+      "rewards/GIT": 0.48046815395355225,
+      "rewards/HPSv2": 0.24678802490234375,
+      "rewards/ORM": 0.5625,
+      "self_certainty_semantic": -25.125,
+      "self_certainty_token": -21.0625,
+      "step": 20
+    },
+    {
+      "completion_length": 59.3125,
+      "epoch": 0.023255813953488372,
+      "grad_norm": 2.0912797451019287,
+      "kl": 0.001811981201171875,
+      "learning_rate": 9.86875e-07,
+      "loss": -0.004398644436150789,
+      "reward": 2.252086877822876,
+      "reward_std": 0.44888848066329956,
+      "rewards/GDino": 0.798213005065918,
+      "rewards/GIT": 0.4853799045085907,
+      "rewards/HPSv2": 0.25956153869628906,
+      "rewards/ORM": 0.7089323997497559,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -22.75,
+      "step": 21
+    },
+    {
+      "completion_length": 52.265625,
+      "epoch": 0.024363233665559248,
+      "grad_norm": 0.5790585875511169,
+      "kl": 0.00191497802734375,
+      "learning_rate": 9.862499999999999e-07,
+      "loss": 0.006876260507851839,
+      "reward": 1.9933909177780151,
+      "reward_std": 0.32367050647735596,
+      "rewards/GDino": 0.7134387493133545,
+      "rewards/GIT": 0.41087181866168976,
+      "rewards/HPSv2": 0.2721214294433594,
+      "rewards/ORM": 0.5969589203596115,
+      "self_certainty_semantic": -25.5,
+      "self_certainty_token": -22.375,
+      "step": 22
+    },
+    {
+      "completion_length": 59.375,
+      "epoch": 0.02547065337763012,
+      "grad_norm": 0.45692723989486694,
+      "kl": 0.001697540283203125,
+      "learning_rate": 9.85625e-07,
+      "loss": -0.00792664848268032,
+      "reward": 2.015365242958069,
+      "reward_std": 0.48256243765354156,
+      "rewards/GDino": 0.724082350730896,
+      "rewards/GIT": 0.42729710042476654,
+      "rewards/HPSv2": 0.2667560577392578,
+      "rewards/ORM": 0.5972296893596649,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -20.6875,
+      "step": 23
+    },
+    {
+      "completion_length": 55.203125,
+      "epoch": 0.026578073089700997,
+      "grad_norm": 0.46439889073371887,
+      "kl": 0.0016937255859375,
+      "learning_rate": 9.849999999999999e-07,
+      "loss": 0.0024933242239058018,
+      "reward": 2.460409939289093,
+      "reward_std": 0.4443647414445877,
+      "rewards/GDino": 0.8454739451408386,
+      "rewards/GIT": 0.6258784532546997,
+      "rewards/HPSv2": 0.2624950408935547,
+      "rewards/ORM": 0.7265625,
+      "self_certainty_semantic": -25.6875,
+      "self_certainty_token": -21.25,
+      "step": 24
+    },
+    {
+      "completion_length": 60.15625,
+      "epoch": 0.02768549280177187,
+      "grad_norm": 0.47176027297973633,
+      "kl": 0.001880645751953125,
+      "learning_rate": 9.84375e-07,
+      "loss": 0.005812188144773245,
+      "reward": 2.0174233317375183,
+      "reward_std": 0.40724658966064453,
+      "rewards/GDino": 0.7186038792133331,
+      "rewards/GIT": 0.4156235605478287,
+      "rewards/HPSv2": 0.26485633850097656,
+      "rewards/ORM": 0.6183395236730576,
+      "self_certainty_semantic": -25.5,
+      "self_certainty_token": -22.0,
+      "step": 25
+    },
+    {
+      "completion_length": 53.21875,
+      "epoch": 0.028792912513842746,
+      "grad_norm": 0.716375470161438,
+      "kl": 0.00209808349609375,
+      "learning_rate": 9.8375e-07,
+      "loss": 0.02397427149116993,
+      "reward": 2.186239182949066,
+      "reward_std": 0.46710920333862305,
+      "rewards/GDino": 0.7593750059604645,
+      "rewards/GIT": 0.5171153843402863,
+      "rewards/HPSv2": 0.2734565734863281,
+      "rewards/ORM": 0.6362921893596649,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -23.0,
+      "step": 26
+    },
+    {
+      "completion_length": 58.421875,
+      "epoch": 0.029900332225913623,
+      "grad_norm": 0.428893119096756,
+      "kl": 0.00171661376953125,
+      "learning_rate": 9.83125e-07,
+      "loss": -0.005866332910954952,
+      "reward": 1.9681838750839233,
+      "reward_std": 0.3645169883966446,
+      "rewards/GDino": 0.7666666209697723,
+      "rewards/GIT": 0.4486802965402603,
+      "rewards/HPSv2": 0.2419452667236328,
+      "rewards/ORM": 0.5108915567398071,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -21.4375,
+      "step": 27
+    },
+    {
+      "completion_length": 63.328125,
+      "epoch": 0.031007751937984496,
+      "grad_norm": 0.5334203243255615,
+      "kl": 0.002010345458984375,
+      "learning_rate": 9.825e-07,
+      "loss": 0.012586410157382488,
+      "reward": 1.4134111404418945,
+      "reward_std": 0.3155324012041092,
+      "rewards/GDino": 0.6005972325801849,
+      "rewards/GIT": 0.11092349141836166,
+      "rewards/HPSv2": 0.2596569061279297,
+      "rewards/ORM": 0.44223344326019287,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -20.5625,
+      "step": 28
+    },
+    {
+      "completion_length": 56.25,
+      "epoch": 0.03211517165005537,
+      "grad_norm": 0.40832045674324036,
+      "kl": 0.001819610595703125,
+      "learning_rate": 9.81875e-07,
+      "loss": 0.010300841182470322,
+      "reward": 2.465680956840515,
+      "reward_std": 0.298002652823925,
+      "rewards/GDino": 0.862500011920929,
+      "rewards/GIT": 0.6107669174671173,
+      "rewards/HPSv2": 0.28375244140625,
+      "rewards/ORM": 0.7086615860462189,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -21.0625,
+      "step": 29
+    },
+    {
+      "completion_length": 54.953125,
+      "epoch": 0.03322259136212625,
+      "grad_norm": 0.4050670266151428,
+      "kl": 0.002025604248046875,
+      "learning_rate": 9.8125e-07,
+      "loss": -0.001845305785536766,
+      "reward": 2.476737856864929,
+      "reward_std": 0.3756887763738632,
+      "rewards/GDino": 0.8967152833938599,
+      "rewards/GIT": 0.551719531416893,
+      "rewards/HPSv2": 0.24522781372070312,
+      "rewards/ORM": 0.7830752730369568,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -21.25,
+      "step": 30
+    },
+    {
+      "completion_length": 74.84375,
+      "epoch": 0.03433001107419712,
+      "grad_norm": 0.7089686393737793,
+      "kl": 0.001865386962890625,
+      "learning_rate": 9.806249999999998e-07,
+      "loss": 0.023707949556410313,
+      "reward": 1.831493854522705,
+      "reward_std": 0.37860143184661865,
+      "rewards/GDino": 0.6287499666213989,
+      "rewards/GIT": 0.3833145350217819,
+      "rewards/HPSv2": 0.2413043975830078,
+      "rewards/ORM": 0.578125,
+      "self_certainty_semantic": -25.4375,
+      "self_certainty_token": -20.9375,
+      "step": 31
+    },
+    {
+      "completion_length": 71.0625,
+      "epoch": 0.035437430786267994,
+      "grad_norm": 0.45204266905784607,
+      "kl": 0.00200653076171875,
+      "learning_rate": 9.8e-07,
+      "loss": 0.014695112593472004,
+      "reward": 1.5279032588005066,
+      "reward_std": 0.5042913109064102,
+      "rewards/GDino": 0.6702238023281097,
+      "rewards/GIT": 0.24817809462547302,
+      "rewards/HPSv2": 0.2356252670288086,
+      "rewards/ORM": 0.37387609481811523,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -22.5625,
+      "step": 32
+    },
+    {
+      "completion_length": 59.703125,
+      "epoch": 0.036544850498338874,
+      "grad_norm": 0.4359590411186218,
+      "kl": 0.00201416015625,
+      "learning_rate": 9.79375e-07,
+      "loss": 0.00610552029684186,
+      "reward": 2.3108657598495483,
+      "reward_std": 0.4415571391582489,
+      "rewards/GDino": 0.8515625,
+      "rewards/GIT": 0.6067334115505219,
+      "rewards/HPSv2": 0.22726917266845703,
+      "rewards/ORM": 0.6253007054328918,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -21.5,
+      "step": 33
+    },
+    {
+      "completion_length": 58.046875,
+      "epoch": 0.03765227021040975,
+      "grad_norm": 0.5853399038314819,
+      "kl": 0.002033233642578125,
+      "learning_rate": 9.7875e-07,
+      "loss": 0.023541483096778393,
+      "reward": 2.012690246105194,
+      "reward_std": 0.4660336524248123,
+      "rewards/GDino": 0.6989582777023315,
+      "rewards/GIT": 0.40700431168079376,
+      "rewards/HPSv2": 0.24774932861328125,
+      "rewards/ORM": 0.6589783728122711,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -21.8125,
+      "step": 34
+    },
+    {
+      "completion_length": 56.90625,
+      "epoch": 0.03875968992248062,
+      "grad_norm": 0.3787715435028076,
+      "kl": 0.001888275146484375,
+      "learning_rate": 9.78125e-07,
+      "loss": 0.003942073322832584,
+      "reward": 2.452033281326294,
+      "reward_std": 0.3410096764564514,
+      "rewards/GDino": 0.8359375298023224,
+      "rewards/GIT": 0.567652553319931,
+      "rewards/HPSv2": 0.2418804168701172,
+      "rewards/ORM": 0.806562751531601,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -21.3125,
+      "step": 35
+    },
+    {
+      "completion_length": 66.0,
+      "epoch": 0.03986710963455149,
+      "grad_norm": 0.5305721163749695,
+      "kl": 0.005157470703125,
+      "learning_rate": 9.775e-07,
+      "loss": -0.003781900042667985,
+      "reward": 1.8618011474609375,
+      "reward_std": 0.4120703786611557,
+      "rewards/GDino": 0.6453125476837158,
+      "rewards/GIT": 0.4281370937824249,
+      "rewards/HPSv2": 0.24621009826660156,
+      "rewards/ORM": 0.5421415567398071,
+      "self_certainty_semantic": -25.5625,
+      "self_certainty_token": -20.9375,
+      "step": 36
+    },
+    {
+      "completion_length": 51.40625,
+      "epoch": 0.04097452934662237,
+      "grad_norm": 0.46515390276908875,
+      "kl": 0.002716064453125,
+      "learning_rate": 9.76875e-07,
+      "loss": 0.006902199704200029,
+      "reward": 1.9485998153686523,
+      "reward_std": 0.42147715389728546,
+      "rewards/GDino": 0.6951449513435364,
+      "rewards/GIT": 0.31057579815387726,
+      "rewards/HPSv2": 0.26158714294433594,
+      "rewards/ORM": 0.681291937828064,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -22.1875,
+      "step": 37
+    },
+    {
+      "completion_length": 71.03125,
+      "epoch": 0.042081949058693245,
+      "grad_norm": 0.951810896396637,
+      "kl": 0.00226593017578125,
+      "learning_rate": 9.7625e-07,
+      "loss": 0.03428783547133207,
+      "reward": 1.9112213850021362,
+      "reward_std": 0.30633312463760376,
+      "rewards/GDino": 0.7401995956897736,
+      "rewards/GIT": 0.30288365483283997,
+      "rewards/HPSv2": 0.2552833557128906,
+      "rewards/ORM": 0.6128547042608261,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -21.125,
+      "step": 38
+    },
+    {
+      "completion_length": 67.6875,
+      "epoch": 0.04318936877076412,
+      "grad_norm": 0.6357575058937073,
+      "kl": 0.01482391357421875,
+      "learning_rate": 9.756249999999999e-07,
+      "loss": 0.023865018505603075,
+      "reward": 2.345404624938965,
+      "reward_std": 0.31367097795009613,
+      "rewards/GDino": 0.8703815042972565,
+      "rewards/GIT": 0.4902418553829193,
+      "rewards/HPSv2": 0.26603126525878906,
+      "rewards/ORM": 0.71875,
+      "self_certainty_semantic": -25.5625,
+      "self_certainty_token": -21.75,
+      "step": 39
+    },
+    {
+      "completion_length": 59.671875,
+      "epoch": 0.044296788482835,
+      "grad_norm": 0.5422465801239014,
+      "kl": 0.00281524658203125,
+      "learning_rate": 9.75e-07,
+      "loss": -0.018710695207118988,
+      "reward": 2.222834825515747,
+      "reward_std": 0.42842796444892883,
+      "rewards/GDino": 0.8634105622768402,
+      "rewards/GIT": 0.40908148139715195,
+      "rewards/HPSv2": 0.27498817443847656,
+      "rewards/ORM": 0.6753546893596649,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -21.125,
+      "step": 40
+    },
+    {
+      "completion_length": 60.9375,
+      "epoch": 0.04540420819490587,
+      "grad_norm": 0.7511593103408813,
+      "kl": 0.00299072265625,
+      "learning_rate": 9.743749999999999e-07,
+      "loss": 0.005782268475741148,
+      "reward": 1.8980144262313843,
+      "reward_std": 0.3208035007119179,
+      "rewards/GDino": 0.6784752607345581,
+      "rewards/GIT": 0.3914954513311386,
+      "rewards/HPSv2": 0.24643898010253906,
+      "rewards/ORM": 0.5816046595573425,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -21.25,
+      "step": 41
+    },
+    {
+      "completion_length": 48.4375,
+      "epoch": 0.046511627906976744,
+      "grad_norm": 0.5177002549171448,
+      "kl": 0.0025177001953125,
+      "learning_rate": 9.7375e-07,
+      "loss": 0.045526545494794846,
+      "reward": 2.269711136817932,
+      "reward_std": 0.48014624416828156,
+      "rewards/GDino": 0.8855312466621399,
+      "rewards/GIT": 0.4437972754240036,
+      "rewards/HPSv2": 0.2572154998779297,
+      "rewards/ORM": 0.6831671893596649,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -20.5625,
+      "step": 42
+    },
+    {
+      "completion_length": 67.875,
+      "epoch": 0.047619047619047616,
+      "grad_norm": 0.5885121822357178,
+      "kl": 0.002044677734375,
+      "learning_rate": 9.73125e-07,
+      "loss": 0.013573684729635715,
+      "reward": 1.6382005214691162,
+      "reward_std": 0.38919302821159363,
+      "rewards/GDino": 0.6114583313465118,
+      "rewards/GIT": 0.3806646466255188,
+      "rewards/HPSv2": 0.23286819458007812,
+      "rewards/ORM": 0.41320937871932983,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -21.0625,
+      "step": 43
+    },
+    {
+      "completion_length": 54.4375,
+      "epoch": 0.048726467331118496,
+      "grad_norm": 0.40727919340133667,
+      "kl": 0.0020751953125,
+      "learning_rate": 9.725e-07,
+      "loss": -0.01244093757122755,
+      "reward": 2.8831005096435547,
+      "reward_std": 0.31665875762701035,
+      "rewards/GDino": 0.9588541388511658,
+      "rewards/GIT": 0.7738310992717743,
+      "rewards/HPSv2": 0.2601909637451172,
+      "rewards/ORM": 0.8902243673801422,
+      "self_certainty_semantic": -25.5625,
+      "self_certainty_token": -21.0625,
+      "step": 44
+    },
+    {
+      "completion_length": 54.90625,
+      "epoch": 0.04983388704318937,
+      "grad_norm": 0.4928445816040039,
+      "kl": 0.0024566650390625,
+      "learning_rate": 9.71875e-07,
+      "loss": 0.00010553281754255295,
+      "reward": 2.4343937635421753,
+      "reward_std": 0.5984751731157303,
+      "rewards/GDino": 0.862500011920929,
+      "rewards/GIT": 0.5139474421739578,
+      "rewards/HPSv2": 0.26379966735839844,
+      "rewards/ORM": 0.7941466569900513,
+      "self_certainty_semantic": -25.5625,
+      "self_certainty_token": -21.25,
+      "step": 45
+    },
+    {
+      "completion_length": 56.078125,
+      "epoch": 0.05094130675526024,
+      "grad_norm": 0.37051326036453247,
+      "kl": 0.00231170654296875,
+      "learning_rate": 9.712499999999998e-07,
+      "loss": 0.007893505971878767,
+      "reward": 1.9575175046920776,
+      "reward_std": 0.3945648521184921,
+      "rewards/GDino": 0.5999999940395355,
+      "rewards/GIT": 0.32395021617412567,
+      "rewards/HPSv2": 0.26719093322753906,
+      "rewards/ORM": 0.7663763463497162,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -22.0625,
+      "step": 46
+    },
+    {
+      "completion_length": 55.171875,
+      "epoch": 0.05204872646733112,
+      "grad_norm": 0.8945181369781494,
+      "kl": 0.0025634765625,
+      "learning_rate": 9.70625e-07,
+      "loss": -0.0013387980870902538,
+      "reward": 1.836871862411499,
+      "reward_std": 0.23468619585037231,
+      "rewards/GDino": 0.7209739089012146,
+      "rewards/GIT": 0.22856376320123672,
+      "rewards/HPSv2": 0.27921295166015625,
+      "rewards/ORM": 0.608121246099472,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -21.6875,
+      "step": 47
+    },
+    {
+      "completion_length": 57.984375,
+      "epoch": 0.053156146179401995,
+      "grad_norm": 1.6689982414245605,
+      "kl": 0.00267791748046875,
+      "learning_rate": 9.7e-07,
+      "loss": 0.022647732868790627,
+      "reward": 1.454766035079956,
+      "reward_std": 0.40884387493133545,
+      "rewards/GDino": 0.6050891876220703,
+      "rewards/GIT": 0.0,
+      "rewards/HPSv2": 0.2698974609375,
+      "rewards/ORM": 0.5797793865203857,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -21.625,
+      "step": 48
+    },
+    {
+      "completion_length": 58.046875,
+      "epoch": 0.05426356589147287,
+      "grad_norm": 0.4761441648006439,
+      "kl": 0.002048492431640625,
+      "learning_rate": 9.69375e-07,
+      "loss": 0.016307475278154016,
+      "reward": 1.9066129326820374,
+      "reward_std": 0.5319462567567825,
+      "rewards/GDino": 0.7744874656200409,
+      "rewards/GIT": 0.2370736114680767,
+      "rewards/HPSv2": 0.2514495849609375,
+      "rewards/ORM": 0.6436022371053696,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -21.625,
+      "step": 49
+    },
+    {
+      "completion_length": 61.0,
+      "epoch": 0.05537098560354374,
+      "grad_norm": 0.8074173331260681,
+      "kl": 0.0040283203125,
+      "learning_rate": 9.6875e-07,
+      "loss": 0.005913220578804612,
+      "reward": 2.0915883779525757,
+      "reward_std": 0.5395111739635468,
+      "rewards/GDino": 0.7859093248844147,
+      "rewards/GIT": 0.3929952085018158,
+      "rewards/HPSv2": 0.25482940673828125,
+      "rewards/ORM": 0.657854437828064,
+      "self_certainty_semantic": -25.5625,
+      "self_certainty_token": -22.25,
+      "step": 50
+    },
+    {
+      "completion_length": 44.359375,
+      "epoch": 0.05647840531561462,
+      "grad_norm": 0.5618427991867065,
+      "kl": 0.002471923828125,
+      "learning_rate": 9.68125e-07,
+      "loss": -0.003945098840631545,
+      "reward": 1.8058671951293945,
+      "reward_std": 0.5712144523859024,
+      "rewards/GDino": 0.7815796732902527,
+      "rewards/GIT": 0.2604931816458702,
+      "rewards/HPSv2": 0.27115440368652344,
+      "rewards/ORM": 0.49263995885849,
+      "self_certainty_semantic": -25.4375,
+      "self_certainty_token": -22.625,
+      "step": 51
+    },
+    {
+      "completion_length": 48.0,
+      "epoch": 0.05758582502768549,
+      "grad_norm": 107.57159423828125,
+      "kl": 26.37615966796875,
+      "learning_rate": 9.675e-07,
+      "loss": 0.27801212295889854,
+      "reward": 2.4165316820144653,
+      "reward_std": 0.2998274937272072,
+      "rewards/GDino": 0.9244791567325592,
+      "rewards/GIT": 0.6574473828077316,
+      "rewards/HPSv2": 0.2756366729736328,
+      "rewards/ORM": 0.5589684545993805,
+      "self_certainty_semantic": -25.4375,
+      "self_certainty_token": -21.375,
+      "step": 52
+    },
+    {
+      "completion_length": 52.140625,
+      "epoch": 0.058693244739756366,
+      "grad_norm": 0.4408358931541443,
+      "kl": 0.00232696533203125,
+      "learning_rate": 9.66875e-07,
+      "loss": 0.013528472045436502,
+      "reward": 1.8899905681610107,
+      "reward_std": 0.4558149725198746,
+      "rewards/GDino": 0.730059951543808,
+      "rewards/GIT": 0.39098620414733887,
+      "rewards/HPSv2": 0.24242782592773438,
+      "rewards/ORM": 0.5265165567398071,
+      "self_certainty_semantic": -25.0,
+      "self_certainty_token": -20.5625,
+      "step": 53
+    },
+    {
+      "completion_length": 58.390625,
+      "epoch": 0.059800664451827246,
+      "grad_norm": 0.48384228348731995,
+      "kl": 0.00225067138671875,
+      "learning_rate": 9.6625e-07,
+      "loss": 0.005568797350861132,
+      "reward": 1.638724684715271,
+      "reward_std": 0.41337575018405914,
+      "rewards/GDino": 0.6137361526489258,
+      "rewards/GIT": 0.24863167852163315,
+      "rewards/HPSv2": 0.24831581115722656,
+      "rewards/ORM": 0.5280410945415497,
+      "self_certainty_semantic": -25.4375,
+      "self_certainty_token": -20.9375,
+      "step": 54
+    },
+    {
+      "completion_length": 50.234375,
+      "epoch": 0.06090808416389812,
+      "grad_norm": 0.46963369846343994,
+      "kl": 0.0026397705078125,
+      "learning_rate": 9.65625e-07,
+      "loss": 0.009267964400351048,
+      "reward": 1.7191376686096191,
+      "reward_std": 0.521537572145462,
+      "rewards/GDino": 0.7086881995201111,
+      "rewards/GIT": 0.3270767852663994,
+      "rewards/HPSv2": 0.2678356170654297,
+      "rewards/ORM": 0.4155370891094208,
+      "self_certainty_semantic": -25.5,
+      "self_certainty_token": -21.6875,
+      "step": 55
+    },
+    {
+      "completion_length": 59.953125,
+      "epoch": 0.06201550387596899,
+      "grad_norm": 0.6913841366767883,
+      "kl": 0.0024261474609375,
+      "learning_rate": 9.649999999999999e-07,
+      "loss": 0.03414425998926163,
+      "reward": 1.9336698055267334,
+      "reward_std": 0.45749759674072266,
+      "rewards/GDino": 0.6963726580142975,
+      "rewards/GIT": 0.38425514101982117,
+      "rewards/HPSv2": 0.2471466064453125,
+      "rewards/ORM": 0.6058953106403351,
+      "self_certainty_semantic": -25.0625,
+      "self_certainty_token": -21.875,
+      "step": 56
+    },
+    {
+      "completion_length": 50.765625,
+      "epoch": 0.06312292358803986,
+      "grad_norm": 0.5066769123077393,
+      "kl": 0.002532958984375,
+      "learning_rate": 9.64375e-07,
+      "loss": 0.009842937346547842,
+      "reward": 1.8338811993598938,
+      "reward_std": 0.3951306492090225,
+      "rewards/GDino": 0.7909577786922455,
+      "rewards/GIT": 0.24781160056591034,
+      "rewards/HPSv2": 0.2509651184082031,
+      "rewards/ORM": 0.5441466569900513,
+      "self_certainty_semantic": -25.5,
+      "self_certainty_token": -21.4375,
+      "step": 57
+    },
+    {
+      "completion_length": 52.9375,
+      "epoch": 0.06423034330011074,
+      "grad_norm": 0.37791869044303894,
+      "kl": 0.002685546875,
+      "learning_rate": 9.637499999999999e-07,
+      "loss": 0.024126023054122925,
+      "reward": 1.8852884769439697,
+      "reward_std": 0.46756890416145325,
+      "rewards/GDino": 0.732811689376831,
+      "rewards/GIT": 0.38145140558481216,
+      "rewards/HPSv2": 0.2541465759277344,
+      "rewards/ORM": 0.5168787688016891,
+      "self_certainty_semantic": -25.4375,
+      "self_certainty_token": -21.25,
+      "step": 58
+    },
+    {
+      "completion_length": 47.234375,
+      "epoch": 0.06533776301218161,
+      "grad_norm": 0.7410405278205872,
+      "kl": 0.0026092529296875,
+      "learning_rate": 9.63125e-07,
+      "loss": -0.01674468442797661,
+      "reward": 2.3462648391723633,
+      "reward_std": 0.2433818019926548,
+      "rewards/GDino": 0.8425607979297638,
+      "rewards/GIT": 0.46571947634220123,
+      "rewards/HPSv2": 0.2664222717285156,
+      "rewards/ORM": 0.771562248468399,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -21.25,
+      "step": 59
+    },
+    {
+      "completion_length": 45.015625,
+      "epoch": 0.0664451827242525,
+      "grad_norm": 0.5326105952262878,
+      "kl": 0.0026397705078125,
+      "learning_rate": 9.624999999999999e-07,
+      "loss": 0.003804182168096304,
+      "reward": 2.036432147026062,
+      "reward_std": 0.3990803211927414,
+      "rewards/GDino": 0.8798050284385681,
+      "rewards/GIT": 0.4744318723678589,
+      "rewards/HPSv2": 0.238006591796875,
+      "rewards/ORM": 0.44418865442276,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -20.6875,
+      "step": 60
+    },
+    {
+      "completion_length": 65.0,
+      "epoch": 0.06755260243632337,
+      "grad_norm": 0.5713196396827698,
+      "kl": 0.00235748291015625,
+      "learning_rate": 9.61875e-07,
+      "loss": 0.04368375800549984,
+      "reward": 2.1398236751556396,
+      "reward_std": 0.3530130609869957,
+      "rewards/GDino": 0.7138020694255829,
+      "rewards/GIT": 0.644903838634491,
+      "rewards/HPSv2": 0.2529468536376953,
+      "rewards/ORM": 0.5281709432601929,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -20.6875,
+      "step": 61
+    },
+    {
+      "completion_length": 54.9375,
+      "epoch": 0.06866002214839424,
+      "grad_norm": 5.612445831298828,
+      "kl": 0.00339508056640625,
+      "learning_rate": 9.6125e-07,
+      "loss": 0.008875304833054543,
+      "reward": 2.497900605201721,
+      "reward_std": 0.41675496101379395,
+      "rewards/GDino": 0.872697502374649,
+      "rewards/GIT": 0.601748138666153,
+      "rewards/HPSv2": 0.2640380859375,
+      "rewards/ORM": 0.759416937828064,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -20.875,
+      "step": 62
+    },
+    {
+      "completion_length": 49.734375,
+      "epoch": 0.06976744186046512,
+      "grad_norm": 0.5861217379570007,
+      "kl": 0.003326416015625,
+      "learning_rate": 9.606249999999998e-07,
+      "loss": 0.01025251136161387,
+      "reward": 2.2640050053596497,
+      "reward_std": 0.48744213581085205,
+      "rewards/GDino": 0.8172852694988251,
+      "rewards/GIT": 0.44742196798324585,
+      "rewards/HPSv2": 0.2430896759033203,
+      "rewards/ORM": 0.7562080323696136,
+      "self_certainty_semantic": -25.125,
+      "self_certainty_token": -22.1875,
+      "step": 63
+    },
+    {
+      "completion_length": 64.375,
+      "epoch": 0.07087486157253599,
+      "grad_norm": 0.39266109466552734,
+      "kl": 0.00298309326171875,
+      "learning_rate": 9.6e-07,
+      "loss": -0.005469436291605234,
+      "reward": 1.6910768747329712,
+      "reward_std": 0.2151722088456154,
+      "rewards/GDino": 0.7097718715667725,
+      "rewards/GIT": 0.32366037368774414,
+      "rewards/HPSv2": 0.2576026916503906,
+      "rewards/ORM": 0.40004195272922516,
+      "self_certainty_semantic": -25.5,
+      "self_certainty_token": -21.9375,
+      "step": 64
+    },
+    {
+      "completion_length": 61.515625,
+      "epoch": 0.07198228128460686,
+      "grad_norm": 0.705937922000885,
+      "kl": 0.002685546875,
+      "learning_rate": 9.59375e-07,
+      "loss": 0.010601098649203777,
+      "reward": 2.128853142261505,
+      "reward_std": 0.4351096749305725,
+      "rewards/GDino": 0.7197916805744171,
+      "rewards/GIT": 0.6168824732303619,
+      "rewards/HPSv2": 0.23163414001464844,
+      "rewards/ORM": 0.5605448335409164,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -22.75,
+      "step": 65
+    },
+    {
+      "completion_length": 49.0,
+      "epoch": 0.07308970099667775,
+      "grad_norm": 0.4427480101585388,
+      "kl": 0.002899169921875,
+      "learning_rate": 9.5875e-07,
+      "loss": 0.02646360918879509,
+      "reward": 2.1654986143112183,
+      "reward_std": 0.37753987312316895,
+      "rewards/GDino": 0.6895833611488342,
+      "rewards/GIT": 0.48387444019317627,
+      "rewards/HPSv2": 0.2579364776611328,
+      "rewards/ORM": 0.7341042160987854,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -21.5625,
+      "step": 66
+    },
+    {
+      "completion_length": 63.140625,
+      "epoch": 0.07419712070874862,
+      "grad_norm": 0.7619237899780273,
+      "kl": 0.00284576416015625,
+      "learning_rate": 9.58125e-07,
+      "loss": 0.026691121514886618,
+      "reward": 2.3450592160224915,
+      "reward_std": 0.2740027904510498,
+      "rewards/GDino": 0.8025760054588318,
+      "rewards/GIT": 0.5677543580532074,
+      "rewards/HPSv2": 0.2594585418701172,
+      "rewards/ORM": 0.7152703106403351,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -21.875,
+      "step": 67
+    },
+    {
+      "completion_length": 50.171875,
+      "epoch": 0.0753045404208195,
+      "grad_norm": 0.4760603904724121,
+      "kl": 0.0030517578125,
+      "learning_rate": 9.575e-07,
+      "loss": 0.022392848506569862,
+      "reward": 1.6361079216003418,
+      "reward_std": 0.33574268221855164,
+      "rewards/GDino": 0.6061920523643494,
+      "rewards/GIT": 0.31722745299339294,
+      "rewards/HPSv2": 0.2595634460449219,
+      "rewards/ORM": 0.453125,
+      "self_certainty_semantic": -25.0625,
+      "self_certainty_token": -21.5,
+      "step": 68
+    },
+    {
+      "completion_length": 55.046875,
+      "epoch": 0.07641196013289037,
+      "grad_norm": 0.5907943248748779,
+      "kl": 0.00336456298828125,
+      "learning_rate": 9.56875e-07,
+      "loss": -0.0030646873638033867,
+      "reward": 2.119426429271698,
+      "reward_std": 0.298831045627594,
+      "rewards/GDino": 0.8028125166893005,
+      "rewards/GIT": 0.3893257826566696,
+      "rewards/HPSv2": 0.2710380554199219,
+      "rewards/ORM": 0.65625,
+      "self_certainty_semantic": -25.4375,
+      "self_certainty_token": -21.4375,
+      "step": 69
+    },
+    {
+      "completion_length": 51.140625,
+      "epoch": 0.07751937984496124,
+      "grad_norm": 0.47751373052597046,
+      "kl": 0.00359344482421875,
+      "learning_rate": 9.5625e-07,
+      "loss": -0.011344656813889742,
+      "reward": 1.4646188020706177,
+      "reward_std": 0.5817874372005463,
+      "rewards/GDino": 0.5935695767402649,
+      "rewards/GIT": 0.23897356167435646,
+      "rewards/HPSv2": 0.25234222412109375,
+      "rewards/ORM": 0.37973344326019287,
+      "self_certainty_semantic": -24.875,
+      "self_certainty_token": -20.9375,
+      "step": 70
+    },
+    {
+      "completion_length": 55.609375,
+      "epoch": 0.07862679955703211,
+      "grad_norm": 0.5281980633735657,
+      "kl": 0.0030364990234375,
+      "learning_rate": 9.556249999999999e-07,
+      "loss": -0.023217559792101383,
+      "reward": 1.856022596359253,
+      "reward_std": 0.4435942769050598,
+      "rewards/GDino": 0.6947268545627594,
+      "rewards/GIT": 0.28702250868082047,
+      "rewards/HPSv2": 0.26489830017089844,
+      "rewards/ORM": 0.609375,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -20.5,
+      "step": 71
+    },
+    {
+      "completion_length": 42.390625,
+      "epoch": 0.07973421926910298,
+      "grad_norm": 0.4538242518901825,
+      "kl": 0.002960205078125,
+      "learning_rate": 9.55e-07,
+      "loss": 0.016265914775431156,
+      "reward": 1.911847174167633,
+      "reward_std": 0.4016146659851074,
+      "rewards/GDino": 0.6731474995613098,
+      "rewards/GIT": 0.46439771354198456,
+      "rewards/HPSv2": 0.2497406005859375,
+      "rewards/ORM": 0.524561420083046,
+      "self_certainty_semantic": -25.0,
+      "self_certainty_token": -20.875,
+      "step": 72
+    },
+    {
+      "completion_length": 53.28125,
+      "epoch": 0.08084163898117387,
+      "grad_norm": 0.5773823261260986,
+      "kl": 0.00330352783203125,
+      "learning_rate": 9.543749999999999e-07,
+      "loss": -0.0016377167776226997,
+      "reward": 2.114488363265991,
+      "reward_std": 0.44427454471588135,
+      "rewards/GDino": 0.8240922689437866,
+      "rewards/GIT": 0.4950668513774872,
+      "rewards/HPSv2": 0.24412155151367188,
+      "rewards/ORM": 0.5512077808380127,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -21.0,
+      "step": 73
+    },
+    {
+      "completion_length": 56.296875,
+      "epoch": 0.08194905869324474,
+      "grad_norm": 0.43449509143829346,
+      "kl": 0.0035247802734375,
+      "learning_rate": 9.5375e-07,
+      "loss": 0.03005522396415472,
+      "reward": 2.32301664352417,
+      "reward_std": 0.22542773187160492,
+      "rewards/GDino": 0.864062488079071,
+      "rewards/GIT": 0.5282620340585709,
+      "rewards/HPSv2": 0.25408363342285156,
+      "rewards/ORM": 0.6766084730625153,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -22.25,
+      "step": 74
+    },
+    {
+      "completion_length": 67.6875,
+      "epoch": 0.08305647840531562,
+      "grad_norm": 0.4218258857727051,
+      "kl": 0.0028228759765625,
+      "learning_rate": 9.53125e-07,
+      "loss": 0.015081442426890135,
+      "reward": 1.7625158429145813,
+      "reward_std": 0.4334114193916321,
+      "rewards/GDino": 0.6663236618041992,
+      "rewards/GIT": 0.26877461373806,
+      "rewards/HPSv2": 0.2647876739501953,
+      "rewards/ORM": 0.5626298785209656,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -21.125,
+      "step": 75
+    },
+    {
+      "completion_length": 62.15625,
+      "epoch": 0.08416389811738649,
+      "grad_norm": 0.45278123021125793,
+      "kl": 0.00312042236328125,
+      "learning_rate": 9.525e-07,
+      "loss": 0.01650754688307643,
+      "reward": 2.2938578128814697,
+      "reward_std": 0.5077499151229858,
+      "rewards/GDino": 0.7734375,
+      "rewards/GIT": 0.6401466727256775,
+      "rewards/HPSv2": 0.2568778991699219,
+      "rewards/ORM": 0.6233955323696136,
+      "self_certainty_semantic": -25.4375,
+      "self_certainty_token": -21.125,
+      "step": 76
+    },
+    {
+      "completion_length": 50.984375,
+      "epoch": 0.08527131782945736,
+      "grad_norm": 0.5513558387756348,
+      "kl": 0.004730224609375,
+      "learning_rate": 9.51875e-07,
+      "loss": -0.008258584188297391,
+      "reward": 1.6354877948760986,
+      "reward_std": 0.5420883148908615,
+      "rewards/GDino": 0.643737405538559,
+      "rewards/GIT": 0.20579323172569275,
+      "rewards/HPSv2": 0.2405567169189453,
+      "rewards/ORM": 0.5454003810882568,
+      "self_certainty_semantic": -25.0625,
+      "self_certainty_token": -22.1875,
+      "step": 77
+    },
+    {
+      "completion_length": 56.390625,
+      "epoch": 0.08637873754152824,
+      "grad_norm": 0.9578920602798462,
+      "kl": 0.00360107421875,
+      "learning_rate": 9.5125e-07,
+      "loss": 0.0016261041164398193,
+      "reward": 2.061507523059845,
+      "reward_std": 0.2758500352501869,
+      "rewards/GDino": 0.7561410367488861,
+      "rewards/GIT": 0.33666322380304337,
+      "rewards/HPSv2": 0.2762489318847656,
+      "rewards/ORM": 0.6924542784690857,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -21.1875,
+      "step": 78
+    },
+    {
+      "completion_length": 57.375,
+      "epoch": 0.08748615725359911,
+      "grad_norm": 0.46459418535232544,
+      "kl": 0.004241943359375,
+      "learning_rate": 9.50625e-07,
+      "loss": -0.019409675151109695,
+      "reward": 2.298323154449463,
+      "reward_std": 0.22066934406757355,
+      "rewards/GDino": 0.8136925399303436,
+      "rewards/GIT": 0.6333461850881577,
+      "rewards/HPSv2": 0.27008056640625,
+      "rewards/ORM": 0.5812040567398071,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -21.875,
+      "step": 79
+    },
+    {
+      "completion_length": 60.5625,
+      "epoch": 0.08859357696567,
+      "grad_norm": 0.4274587631225586,
+      "kl": 0.004058837890625,
+      "learning_rate": 9.499999999999999e-07,
+      "loss": 0.013256619684398174,
+      "reward": 1.6786987781524658,
+      "reward_std": 0.3984425514936447,
+      "rewards/GDino": 0.6007516384124756,
+      "rewards/GIT": 0.18326736986637115,
+      "rewards/HPSv2": 0.2720355987548828,
+      "rewards/ORM": 0.6226442158222198,
+      "self_certainty_semantic": -25.5,
+      "self_certainty_token": -21.3125,
+      "step": 80
+    },
+    {
+      "completion_length": 58.03125,
+      "epoch": 0.08970099667774087,
+      "grad_norm": 0.9172859191894531,
+      "kl": 0.00426483154296875,
+      "learning_rate": 9.493749999999999e-07,
+      "loss": 0.003496276680380106,
+      "reward": 2.106017231941223,
+      "reward_std": 0.30050399899482727,
+      "rewards/GDino": 0.7440759837627411,
+      "rewards/GIT": 0.3581302911043167,
+      "rewards/HPSv2": 0.27126121520996094,
+      "rewards/ORM": 0.7325496971607208,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -20.875,
+      "step": 81
+    },
+    {
+      "completion_length": 49.5625,
+      "epoch": 0.09080841638981174,
+      "grad_norm": 0.4841405153274536,
+      "kl": 0.00412750244140625,
+      "learning_rate": 9.487499999999999e-07,
+      "loss": 0.025506282225251198,
+      "reward": 1.6879253387451172,
+      "reward_std": 0.42353254556655884,
+      "rewards/GDino": 0.6098452508449554,
+      "rewards/GIT": 0.38033944368362427,
+      "rewards/HPSv2": 0.2658271789550781,
+      "rewards/ORM": 0.43191343545913696,
+      "self_certainty_semantic": -25.4375,
+      "self_certainty_token": -20.9375,
+      "step": 82
+    },
+    {
+      "completion_length": 48.328125,
+      "epoch": 0.09191583610188261,
+      "grad_norm": 0.492243230342865,
+      "kl": 0.00345611572265625,
+      "learning_rate": 9.481249999999999e-07,
+      "loss": -0.0034960508346557617,
+      "reward": 2.1111596822738647,
+      "reward_std": 0.41540510952472687,
+      "rewards/GDino": 0.7717877924442291,
+      "rewards/GIT": 0.4860316216945648,
+      "rewards/HPSv2": 0.2670021057128906,
+      "rewards/ORM": 0.5863381326198578,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -21.4375,
+      "step": 83
+    },
+    {
+      "completion_length": 66.3125,
+      "epoch": 0.09302325581395349,
+      "grad_norm": 0.5617808699607849,
+      "kl": 0.004180908203125,
+      "learning_rate": 9.474999999999999e-07,
+      "loss": 0.003248518332839012,
+      "reward": 2.094790816307068,
+      "reward_std": 0.3879907354712486,
+      "rewards/GDino": 0.7973622679710388,
+      "rewards/GIT": 0.632976621389389,
+      "rewards/HPSv2": 0.24137306213378906,
+      "rewards/ORM": 0.4230788052082062,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -21.1875,
+      "step": 84
+    },
+    {
+      "completion_length": 51.40625,
+      "epoch": 0.09413067552602436,
+      "grad_norm": 0.5695884823799133,
+      "kl": 0.003204345703125,
+      "learning_rate": 9.468749999999999e-07,
+      "loss": 0.012543351389467716,
+      "reward": 1.8675293326377869,
+      "reward_std": 0.4282868355512619,
+      "rewards/GDino": 0.6550000011920929,
+      "rewards/GIT": 0.33260630816221237,
+      "rewards/HPSv2": 0.24515533447265625,
+      "rewards/ORM": 0.6347676515579224,
+      "self_certainty_semantic": -25.0625,
+      "self_certainty_token": -21.375,
+      "step": 85
+    },
+    {
+      "completion_length": 48.296875,
+      "epoch": 0.09523809523809523,
+      "grad_norm": 0.46590158343315125,
+      "kl": 0.00469970703125,
+      "learning_rate": 9.462499999999999e-07,
+      "loss": 0.00347991194576025,
+      "reward": 2.2731298208236694,
+      "reward_std": 0.383390873670578,
+      "rewards/GDino": 0.8246111273765564,
+      "rewards/GIT": 0.33447980135679245,
+      "rewards/HPSv2": 0.29212188720703125,
+      "rewards/ORM": 0.821916937828064,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -22.0,
+      "step": 86
+    },
+    {
+      "completion_length": 54.390625,
+      "epoch": 0.09634551495016612,
+      "grad_norm": 0.5397853255271912,
+      "kl": 0.004425048828125,
+      "learning_rate": 9.45625e-07,
+      "loss": 0.008617566898465157,
+      "reward": 2.2459940314292908,
+      "reward_std": 0.4676859378814697,
+      "rewards/GDino": 0.7356771230697632,
+      "rewards/GIT": 0.46453191339969635,
+      "rewards/HPSv2": 0.26766395568847656,
+      "rewards/ORM": 0.7781210243701935,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -21.0,
+      "step": 87
+    },
+    {
+      "completion_length": 42.15625,
+      "epoch": 0.09745293466223699,
+      "grad_norm": 0.48280662298202515,
+      "kl": 0.00406646728515625,
+      "learning_rate": 9.45e-07,
+      "loss": 0.016791983507573605,
+      "reward": 2.1528985500335693,
+      "reward_std": 0.44025059044361115,
+      "rewards/GDino": 0.7985424101352692,
+      "rewards/GIT": 0.47699007391929626,
+      "rewards/HPSv2": 0.2789325714111328,
+      "rewards/ORM": 0.5984334945678711,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -20.875,
+      "step": 88
+    },
+    {
+      "completion_length": 48.25,
+      "epoch": 0.09856035437430787,
+      "grad_norm": 0.4512772560119629,
+      "kl": 0.00405120849609375,
+      "learning_rate": 9.44375e-07,
+      "loss": -0.009609811007976532,
+      "reward": 2.155352771282196,
+      "reward_std": 0.3193782642483711,
+      "rewards/GDino": 0.7525902688503265,
+      "rewards/GIT": 0.4481022357940674,
+      "rewards/HPSv2": 0.2619743347167969,
+      "rewards/ORM": 0.6926859021186829,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -21.125,
+      "step": 89
+    },
+    {
+      "completion_length": 47.78125,
+      "epoch": 0.09966777408637874,
+      "grad_norm": 0.5204576849937439,
+      "kl": 0.004425048828125,
+      "learning_rate": 9.4375e-07,
+      "loss": -0.017570611089468002,
+      "reward": 2.3318194150924683,
+      "reward_std": 0.3641355484724045,
+      "rewards/GDino": 0.854687511920929,
+      "rewards/GIT": 0.6271218061447144,
+      "rewards/HPSv2": 0.26286888122558594,
+      "rewards/ORM": 0.5871412754058838,
+      "self_certainty_semantic": -25.5,
+      "self_certainty_token": -21.4375,
+      "step": 90
+    },
+    {
+      "completion_length": 54.9375,
+      "epoch": 0.10077519379844961,
+      "grad_norm": 0.7515896558761597,
+      "kl": 0.0042266845703125,
+      "learning_rate": 9.43125e-07,
+      "loss": 0.022024651989340782,
+      "reward": 1.7255874276161194,
+      "reward_std": 0.3924099802970886,
+      "rewards/GDino": 0.6800954043865204,
+      "rewards/GIT": 0.41760827600955963,
+      "rewards/HPSv2": 0.22957611083984375,
+      "rewards/ORM": 0.3983076214790344,
+      "self_certainty_semantic": -24.875,
+      "self_certainty_token": -21.125,
+      "step": 91
+    },
+    {
+      "completion_length": 51.90625,
+      "epoch": 0.10188261351052048,
+      "grad_norm": 0.6844750046730042,
+      "kl": 0.004730224609375,
+      "learning_rate": 9.425e-07,
+      "loss": 0.017017286270856857,
+      "reward": 1.7472361326217651,
+      "reward_std": 0.49342362582683563,
+      "rewards/GDino": 0.7615998685359955,
+      "rewards/GIT": 0.3799494504928589,
+      "rewards/HPSv2": 0.2450580596923828,
+      "rewards/ORM": 0.36062875390052795,
+      "self_certainty_semantic": -25.5,
+      "self_certainty_token": -21.0625,
+      "step": 92
+    },
+    {
+      "completion_length": 52.5625,
+      "epoch": 0.10299003322259136,
+      "grad_norm": 0.476144403219223,
+      "kl": 0.004547119140625,
+      "learning_rate": 9.41875e-07,
+      "loss": -0.006627652794122696,
+      "reward": 2.3529324531555176,
+      "reward_std": 0.38789400458335876,
+      "rewards/GDino": 0.8122400343418121,
+      "rewards/GIT": 0.40920257568359375,
+      "rewards/HPSv2": 0.25894737243652344,
+      "rewards/ORM": 0.8725424408912659,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -20.75,
+      "step": 93
+    },
+    {
+      "completion_length": 45.234375,
+      "epoch": 0.10409745293466224,
+      "grad_norm": 0.4303518235683441,
+      "kl": 0.00390625,
+      "learning_rate": 9.4125e-07,
+      "loss": 0.002329372800886631,
+      "reward": 2.063507556915283,
+      "reward_std": 0.4875355362892151,
+      "rewards/GDino": 0.8157378733158112,
+      "rewards/GIT": 0.2162991166114807,
+      "rewards/HPSv2": 0.2860240936279297,
+      "rewards/ORM": 0.7454463839530945,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -22.25,
+      "step": 94
+    },
+    {
+      "completion_length": 54.21875,
+      "epoch": 0.10520487264673312,
+      "grad_norm": 0.9745371341705322,
+      "kl": 0.004852294921875,
+      "learning_rate": 9.40625e-07,
+      "loss": 0.015892890747636557,
+      "reward": 2.4900766611099243,
+      "reward_std": 0.33158986270427704,
+      "rewards/GDino": 0.9456690549850464,
+      "rewards/GIT": 0.7110534906387329,
+      "rewards/HPSv2": 0.2568836212158203,
+      "rewards/ORM": 0.5764705836772919,
+      "self_certainty_semantic": -25.625,
+      "self_certainty_token": -21.3125,
+      "step": 95
+    },
+    {
+      "completion_length": 62.953125,
+      "epoch": 0.10631229235880399,
+      "grad_norm": 1.6108874082565308,
+      "kl": 0.00475311279296875,
+      "learning_rate": 9.399999999999999e-07,
+      "loss": 0.012537557166069746,
+      "reward": 2.4274561405181885,
+      "reward_std": 0.3028244078159332,
+      "rewards/GDino": 0.9155160486698151,
+      "rewards/GIT": 0.6933247745037079,
+      "rewards/HPSv2": 0.25919437408447266,
+      "rewards/ORM": 0.5594209432601929,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -20.3125,
+      "step": 96
+    },
+    {
+      "completion_length": 44.890625,
+      "epoch": 0.10741971207087486,
+      "grad_norm": 0.42777886986732483,
+      "kl": 0.0064697265625,
+      "learning_rate": 9.393749999999999e-07,
+      "loss": 0.006582918576896191,
+      "reward": 1.7229499220848083,
+      "reward_std": 0.29571742564439774,
+      "rewards/GDino": 0.6976552903652191,
+      "rewards/GIT": 0.17514611035585403,
+      "rewards/HPSv2": 0.2757740020751953,
+      "rewards/ORM": 0.5743745565414429,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -21.5,
+      "step": 97
+    },
+    {
+      "completion_length": 52.78125,
+      "epoch": 0.10852713178294573,
+      "grad_norm": 0.4346785247325897,
+      "kl": 0.00446319580078125,
+      "learning_rate": 9.387499999999999e-07,
+      "loss": 0.010664775501936674,
+      "reward": 1.9896260499954224,
+      "reward_std": 0.5384568274021149,
+      "rewards/GDino": 0.7534899115562439,
+      "rewards/GIT": 0.416723370552063,
+      "rewards/HPSv2": 0.25490760803222656,
+      "rewards/ORM": 0.5645051300525665,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -21.4375,
+      "step": 98
+    },
+    {
+      "completion_length": 53.546875,
+      "epoch": 0.10963455149501661,
+      "grad_norm": 0.4226502478122711,
+      "kl": 0.00543212890625,
+      "learning_rate": 9.381249999999999e-07,
+      "loss": -0.009754271944984794,
+      "reward": 2.1711018085479736,
+      "reward_std": 0.3036491945385933,
+      "rewards/GDino": 0.8239583373069763,
+      "rewards/GIT": 0.6844146698713303,
+      "rewards/HPSv2": 0.24951934814453125,
+      "rewards/ORM": 0.41320937871932983,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -22.0,
+      "step": 99
+    },
+    {
+      "completion_length": 48.0625,
+      "epoch": 0.11074197120708748,
+      "grad_norm": 0.4250389039516449,
+      "kl": 0.00537109375,
+      "learning_rate": 9.374999999999999e-07,
+      "loss": -0.014408082235604525,
+      "reward": 1.9375371932983398,
+      "reward_std": 0.4484590142965317,
+      "rewards/GDino": 0.6897697150707245,
+      "rewards/GIT": 0.4094943553209305,
+      "rewards/HPSv2": 0.2472515106201172,
+      "rewards/ORM": 0.5910216569900513,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -21.4375,
+      "step": 100
+    },
+    {
+      "completion_length": 53.546875,
+      "epoch": 0.11184939091915837,
+      "grad_norm": 0.9339170455932617,
+      "kl": 0.00566864013671875,
+      "learning_rate": 9.368749999999999e-07,
+      "loss": 0.003982411697506905,
+      "reward": 2.2582755088806152,
+      "reward_std": 0.41422703862190247,
+      "rewards/GDino": 0.8809943795204163,
+      "rewards/GIT": 0.5047063827514648,
+      "rewards/HPSv2": 0.27569580078125,
+      "rewards/ORM": 0.5968790352344513,
+      "self_certainty_semantic": -25.6875,
+      "self_certainty_token": -20.6875,
+      "step": 101
+    },
+    {
+      "completion_length": 49.59375,
+      "epoch": 0.11295681063122924,
+      "grad_norm": 0.3932209610939026,
+      "kl": 0.0052032470703125,
+      "learning_rate": 9.3625e-07,
+      "loss": -0.015652057249099016,
+      "reward": 2.285743832588196,
+      "reward_std": 0.4700127840042114,
+      "rewards/GDino": 0.7444302141666412,
+      "rewards/GIT": 0.5256561636924744,
+      "rewards/HPSv2": 0.2712440490722656,
+      "rewards/ORM": 0.7444134056568146,
+      "self_certainty_semantic": -25.125,
+      "self_certainty_token": -21.5625,
+      "step": 102
+    },
+    {
+      "completion_length": 54.6875,
+      "epoch": 0.11406423034330011,
+      "grad_norm": 0.44802016019821167,
+      "kl": 0.006866455078125,
+      "learning_rate": 9.35625e-07,
+      "loss": 0.004784752381965518,
+      "reward": 1.801784873008728,
+      "reward_std": 0.5139727592468262,
+      "rewards/GDino": 0.6817658245563507,
+      "rewards/GIT": 0.18415232002735138,
+      "rewards/HPSv2": 0.2674713134765625,
+      "rewards/ORM": 0.6683953106403351,
+      "self_certainty_semantic": -25.125,
+      "self_certainty_token": -21.5,
+      "step": 103
+    },
+    {
+      "completion_length": 41.671875,
+      "epoch": 0.11517165005537099,
+      "grad_norm": 0.5610178709030151,
+      "kl": 0.0048675537109375,
+      "learning_rate": 9.35e-07,
+      "loss": 0.012662995606660843,
+      "reward": 2.2165188789367676,
+      "reward_std": 0.4234919399023056,
+      "rewards/GDino": 0.8522021770477295,
+      "rewards/GIT": 0.4973383694887161,
+      "rewards/HPSv2": 0.25607872009277344,
+      "rewards/ORM": 0.6108995676040649,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -21.875,
+      "step": 104
+    },
+    {
+      "completion_length": 48.609375,
+      "epoch": 0.11627906976744186,
+      "grad_norm": 0.5205046534538269,
+      "kl": 0.00438690185546875,
+      "learning_rate": 9.34375e-07,
+      "loss": 0.010773615911602974,
+      "reward": 1.9969267845153809,
+      "reward_std": 0.462581530213356,
+      "rewards/GDino": 0.739062488079071,
+      "rewards/GIT": 0.5269564837217331,
+      "rewards/HPSv2": 0.2528705596923828,
+      "rewards/ORM": 0.4780370891094208,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -20.9375,
+      "step": 105
+    },
+    {
+      "completion_length": 53.09375,
+      "epoch": 0.11738648947951273,
+      "grad_norm": 0.4945337772369385,
+      "kl": 0.00521087646484375,
+      "learning_rate": 9.3375e-07,
+      "loss": -0.015708873979747295,
+      "reward": 1.9371621012687683,
+      "reward_std": 0.3034388795495033,
+      "rewards/GDino": 0.7275120615959167,
+      "rewards/GIT": 0.5758572816848755,
+      "rewards/HPSv2": 0.26700592041015625,
+      "rewards/ORM": 0.3667868673801422,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -20.75,
+      "step": 106
+    },
+    {
+      "completion_length": 47.890625,
+      "epoch": 0.1184939091915836,
+      "grad_norm": 0.5665311217308044,
+      "kl": 0.007598876953125,
+      "learning_rate": 9.33125e-07,
+      "loss": -0.003110818797722459,
+      "reward": 2.5077059268951416,
+      "reward_std": 0.4203761965036392,
+      "rewards/GDino": 0.9479166865348816,
+      "rewards/GIT": 0.7534970641136169,
+      "rewards/HPSv2": 0.2581634521484375,
+      "rewards/ORM": 0.548128753900528,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -20.6875,
+      "step": 107
+    },
+    {
+      "completion_length": 42.859375,
+      "epoch": 0.11960132890365449,
+      "grad_norm": 0.5378500819206238,
+      "kl": 0.0073089599609375,
+      "learning_rate": 9.325e-07,
+      "loss": -0.0038322817999869585,
+      "reward": 2.261056423187256,
+      "reward_std": 0.27708302438259125,
+      "rewards/GDino": 0.7834341526031494,
+      "rewards/GIT": 0.4610650986433029,
+      "rewards/HPSv2": 0.28486061096191406,
+      "rewards/ORM": 0.731696605682373,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -20.75,
+      "step": 108
+    },
+    {
+      "completion_length": 50.828125,
+      "epoch": 0.12070874861572536,
+      "grad_norm": 0.4483701288700104,
+      "kl": 0.0067138671875,
+      "learning_rate": 9.31875e-07,
+      "loss": 0.021743599325418472,
+      "reward": 2.118706166744232,
+      "reward_std": 0.3737848997116089,
+      "rewards/GDino": 0.793749988079071,
+      "rewards/GIT": 0.5421168804168701,
+      "rewards/HPSv2": 0.27524757385253906,
+      "rewards/ORM": 0.5075916796922684,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -22.4375,
+      "step": 109
+    },
+    {
+      "completion_length": 44.40625,
+      "epoch": 0.12181616832779624,
+      "grad_norm": 5.2752509117126465,
+      "kl": 0.005615234375,
+      "learning_rate": 9.3125e-07,
+      "loss": -0.008640175685286522,
+      "reward": 2.223414659500122,
+      "reward_std": 0.46220165491104126,
+      "rewards/GDino": 0.8336420953273773,
+      "rewards/GIT": 0.4199307709932327,
+      "rewards/HPSv2": 0.24015045166015625,
+      "rewards/ORM": 0.7296914756298065,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -21.9375,
+      "step": 110
+    },
+    {
+      "completion_length": 54.234375,
+      "epoch": 0.12292358803986711,
+      "grad_norm": 1.4650001525878906,
+      "kl": 0.0050811767578125,
+      "learning_rate": 9.30625e-07,
+      "loss": 0.01813027122989297,
+      "reward": 1.980902910232544,
+      "reward_std": 0.3374909907579422,
+      "rewards/GDino": 0.7292370200157166,
+      "rewards/GIT": 0.3778613805770874,
+      "rewards/HPSv2": 0.27063751220703125,
+      "rewards/ORM": 0.603166937828064,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -21.875,
+      "step": 111
+    },
+    {
+      "completion_length": 42.015625,
+      "epoch": 0.12403100775193798,
+      "grad_norm": 2.8602488040924072,
+      "kl": 0.014862060546875,
+      "learning_rate": 9.3e-07,
+      "loss": 0.001048431033268571,
+      "reward": 2.127329468727112,
+      "reward_std": 0.3023644834756851,
+      "rewards/GDino": 0.7870483100414276,
+      "rewards/GIT": 0.3371267020702362,
+      "rewards/HPSv2": 0.2804222106933594,
+      "rewards/ORM": 0.7227321267127991,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -21.5625,
+      "step": 112
+    },
+    {
+      "completion_length": 40.921875,
+      "epoch": 0.12513842746400886,
+      "grad_norm": 0.47990044951438904,
+      "kl": 0.006256103515625,
+      "learning_rate": 9.293749999999999e-07,
+      "loss": -0.0006602238863706589,
+      "reward": 1.6797617077827454,
+      "reward_std": 0.38670530915260315,
+      "rewards/GDino": 0.6658706367015839,
+      "rewards/GIT": 0.29068493843078613,
+      "rewards/HPSv2": 0.2770404815673828,
+      "rewards/ORM": 0.44616562128067017,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -21.8125,
+      "step": 113
+    },
+    {
+      "completion_length": 45.734375,
+      "epoch": 0.12624584717607973,
+      "grad_norm": 0.7622689008712769,
+      "kl": 0.006622314453125,
+      "learning_rate": 9.287499999999999e-07,
+      "loss": 0.01737637398764491,
+      "reward": 2.336304783821106,
+      "reward_std": 0.2698482424020767,
+      "rewards/GDino": 0.7757812738418579,
+      "rewards/GIT": 0.6553223580121994,
+      "rewards/HPSv2": 0.2794780731201172,
+      "rewards/ORM": 0.6257232129573822,
+      "self_certainty_semantic": -25.0625,
+      "self_certainty_token": -21.6875,
+      "step": 114
+    },
+    {
+      "completion_length": 49.65625,
+      "epoch": 0.1273532668881506,
+      "grad_norm": 0.5282474756240845,
+      "kl": 0.007537841796875,
+      "learning_rate": 9.281249999999999e-07,
+      "loss": 0.016482284292578697,
+      "reward": 2.164160192012787,
+      "reward_std": 0.3744830787181854,
+      "rewards/GDino": 0.710269957780838,
+      "rewards/GIT": 0.5369775593280792,
+      "rewards/HPSv2": 0.2497711181640625,
+      "rewards/ORM": 0.6671415567398071,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -20.6875,
+      "step": 115
+    },
+    {
+      "completion_length": 51.953125,
+      "epoch": 0.12846068660022147,
+      "grad_norm": 0.477001816034317,
+      "kl": 0.0054931640625,
+      "learning_rate": 9.274999999999999e-07,
+      "loss": 0.005994495470076799,
+      "reward": 2.3729158639907837,
+      "reward_std": 0.29741741716861725,
+      "rewards/GDino": 0.7775339484214783,
+      "rewards/GIT": 0.54022216796875,
+      "rewards/HPSv2": 0.26922607421875,
+      "rewards/ORM": 0.7859334647655487,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -20.5,
+      "step": 116
+    },
+    {
+      "completion_length": 47.203125,
+      "epoch": 0.12956810631229235,
+      "grad_norm": 0.5086694955825806,
+      "kl": 0.007476806640625,
+      "learning_rate": 9.268749999999999e-07,
+      "loss": -0.011362070217728615,
+      "reward": 2.5480291843414307,
+      "reward_std": 0.21939973533153534,
+      "rewards/GDino": 0.8421875238418579,
+      "rewards/GIT": 0.6468265354633331,
+      "rewards/HPSv2": 0.28557777404785156,
+      "rewards/ORM": 0.7734375,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -22.1875,
+      "step": 117
+    },
+    {
+      "completion_length": 47.96875,
+      "epoch": 0.13067552602436322,
+      "grad_norm": 0.4937967360019684,
+      "kl": 0.008392333984375,
+      "learning_rate": 9.2625e-07,
+      "loss": 0.021687609143555164,
+      "reward": 1.9581258296966553,
+      "reward_std": 0.4126932621002197,
+      "rewards/GDino": 0.7584865391254425,
+      "rewards/GIT": 0.40825480222702026,
+      "rewards/HPSv2": 0.2484416961669922,
+      "rewards/ORM": 0.5429428219795227,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -21.25,
+      "step": 118
+    },
+    {
+      "completion_length": 42.109375,
+      "epoch": 0.13178294573643412,
+      "grad_norm": 0.5172230005264282,
+      "kl": 0.008087158203125,
+      "learning_rate": 9.25625e-07,
+      "loss": -0.019072898663580418,
+      "reward": 2.281066656112671,
+      "reward_std": 0.5171918570995331,
+      "rewards/GDino": 0.777093768119812,
+      "rewards/GIT": 0.48773056268692017,
+      "rewards/HPSv2": 0.2693214416503906,
+      "rewards/ORM": 0.7469209730625153,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -21.6875,
+      "step": 119
+    },
+    {
+      "completion_length": 47.03125,
+      "epoch": 0.132890365448505,
+      "grad_norm": 0.5932603478431702,
+      "kl": 0.006134033203125,
+      "learning_rate": 9.25e-07,
+      "loss": -0.013699718751013279,
+      "reward": 1.8187614679336548,
+      "reward_std": 0.3037511110305786,
+      "rewards/GDino": 0.6585085391998291,
+      "rewards/GIT": 0.44199828803539276,
+      "rewards/HPSv2": 0.2613792419433594,
+      "rewards/ORM": 0.4568754881620407,
+      "self_certainty_semantic": -25.125,
+      "self_certainty_token": -21.25,
+      "step": 120
+    },
+    {
+      "completion_length": 43.09375,
+      "epoch": 0.13399778516057587,
+      "grad_norm": 0.5679785013198853,
+      "kl": 0.0063629150390625,
+      "learning_rate": 9.243749999999999e-07,
+      "loss": 0.00992331630550325,
+      "reward": 2.001632511615753,
+      "reward_std": 0.43007975816726685,
+      "rewards/GDino": 0.7286458313465118,
+      "rewards/GIT": 0.4185456335544586,
+      "rewards/HPSv2": 0.25841522216796875,
+      "rewards/ORM": 0.5960258543491364,
+      "self_certainty_semantic": -25.0,
+      "self_certainty_token": -21.4375,
+      "step": 121
+    },
+    {
+      "completion_length": 45.125,
+      "epoch": 0.13510520487264674,
+      "grad_norm": 0.5153163075447083,
+      "kl": 0.0077056884765625,
+      "learning_rate": 9.237499999999999e-07,
+      "loss": 0.023711273446679115,
+      "reward": 1.9004549980163574,
+      "reward_std": 0.30516810715198517,
+      "rewards/GDino": 0.7201891243457794,
+      "rewards/GIT": 0.3620809018611908,
+      "rewards/HPSv2": 0.2901439666748047,
+      "rewards/ORM": 0.5280410796403885,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -20.9375,
+      "step": 122
+    },
+    {
+      "completion_length": 40.984375,
+      "epoch": 0.1362126245847176,
+      "grad_norm": 0.5419678092002869,
+      "kl": 0.0060882568359375,
+      "learning_rate": 9.23125e-07,
+      "loss": 0.002633487805724144,
+      "reward": 1.8504286408424377,
+      "reward_std": 0.48781776428222656,
+      "rewards/GDino": 0.7166666686534882,
+      "rewards/GIT": 0.484364315867424,
+      "rewards/HPSv2": 0.25711822509765625,
+      "rewards/ORM": 0.39227938652038574,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -21.375,
+      "step": 123
+    },
+    {
+      "completion_length": 54.109375,
+      "epoch": 0.13732004429678848,
+      "grad_norm": 3.3991947174072266,
+      "kl": 0.00714111328125,
+      "learning_rate": 9.225e-07,
+      "loss": 0.010619609151035547,
+      "reward": 2.2070562839508057,
+      "reward_std": 0.4152011424303055,
+      "rewards/GDino": 0.7853051722049713,
+      "rewards/GIT": 0.4775720238685608,
+      "rewards/HPSv2": 0.25039100646972656,
+      "rewards/ORM": 0.6937879621982574,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -21.625,
+      "step": 124
+    },
+    {
+      "completion_length": 52.390625,
+      "epoch": 0.13842746400885936,
+      "grad_norm": 0.6660559773445129,
+      "kl": 0.00579833984375,
+      "learning_rate": 9.21875e-07,
+      "loss": 0.030846341978758574,
+      "reward": 2.0939546823501587,
+      "reward_std": 0.3831700086593628,
+      "rewards/GDino": 0.8140697479248047,
+      "rewards/GIT": 0.5598450750112534,
+      "rewards/HPSv2": 0.24968528747558594,
+      "rewards/ORM": 0.47035445272922516,
+      "self_certainty_semantic": -25.5625,
+      "self_certainty_token": -21.625,
+      "step": 125
+    },
+    {
+      "completion_length": 40.546875,
+      "epoch": 0.13953488372093023,
+      "grad_norm": 0.4604582190513611,
+      "kl": 0.0074005126953125,
+      "learning_rate": 9.2125e-07,
+      "loss": -0.00871883099898696,
+      "reward": 1.8355292081832886,
+      "reward_std": 0.3674861043691635,
+      "rewards/GDino": 0.7088627219200134,
+      "rewards/GIT": 0.29635151475667953,
+      "rewards/HPSv2": 0.25839805603027344,
+      "rewards/ORM": 0.5719168931245804,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -21.125,
+      "step": 126
+    },
+    {
+      "completion_length": 54.71875,
+      "epoch": 0.1406423034330011,
+      "grad_norm": 0.8080440759658813,
+      "kl": 0.0090179443359375,
+      "learning_rate": 9.20625e-07,
+      "loss": -0.01499070762656629,
+      "reward": 2.144508123397827,
+      "reward_std": 0.3519645929336548,
+      "rewards/GDino": 0.7293722033500671,
+      "rewards/GIT": 0.42373301088809967,
+      "rewards/HPSv2": 0.27573204040527344,
+      "rewards/ORM": 0.7156709134578705,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -21.625,
+      "step": 127
+    },
+    {
+      "completion_length": 51.46875,
+      "epoch": 0.14174972314507198,
+      "grad_norm": 0.5151174068450928,
+      "kl": 0.007843017578125,
+      "learning_rate": 9.2e-07,
+      "loss": 0.011134594678878784,
+      "reward": 1.9095789790153503,
+      "reward_std": 0.5153420865535736,
+      "rewards/GDino": 0.7141143381595612,
+      "rewards/GIT": 0.26926109194755554,
+      "rewards/HPSv2": 0.26451873779296875,
+      "rewards/ORM": 0.6616848409175873,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -21.25,
+      "step": 128
+    },
+    {
+      "completion_length": 49.1875,
+      "epoch": 0.14285714285714285,
+      "grad_norm": 0.5960005521774292,
+      "kl": 0.0080413818359375,
+      "learning_rate": 9.19375e-07,
+      "loss": -0.0018172780983150005,
+      "reward": 2.0566558837890625,
+      "reward_std": 0.29967472702264786,
+      "rewards/GDino": 0.7159374058246613,
+      "rewards/GIT": 0.4233300983905792,
+      "rewards/HPSv2": 0.25041770935058594,
+      "rewards/ORM": 0.6669707000255585,
+      "self_certainty_semantic": -25.125,
+      "self_certainty_token": -21.0625,
+      "step": 129
+    },
+    {
+      "completion_length": 47.046875,
+      "epoch": 0.14396456256921372,
+      "grad_norm": 0.5405691862106323,
+      "kl": 0.0067596435546875,
+      "learning_rate": 9.187499999999999e-07,
+      "loss": 0.032197900116443634,
+      "reward": 1.9141977429389954,
+      "reward_std": 0.455816388130188,
+      "rewards/GDino": 0.7374999821186066,
+      "rewards/GIT": 0.4731251299381256,
+      "rewards/HPSv2": 0.2379016876220703,
+      "rewards/ORM": 0.46567094326019287,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -21.1875,
+      "step": 130
+    },
+    {
+      "completion_length": 44.125,
+      "epoch": 0.1450719822812846,
+      "grad_norm": 0.524912416934967,
+      "kl": 0.009796142578125,
+      "learning_rate": 9.181249999999999e-07,
+      "loss": -0.0008213929831981659,
+      "reward": 1.8893061876296997,
+      "reward_std": 0.3467349708080292,
+      "rewards/GDino": 0.7044448256492615,
+      "rewards/GIT": 0.35102422535419464,
+      "rewards/HPSv2": 0.26580047607421875,
+      "rewards/ORM": 0.5680365860462189,
+      "self_certainty_semantic": -25.125,
+      "self_certainty_token": -21.8125,
+      "step": 131
+    },
+    {
+      "completion_length": 34.625,
+      "epoch": 0.1461794019933555,
+      "grad_norm": 0.8808599710464478,
+      "kl": 0.011566162109375,
+      "learning_rate": 9.174999999999999e-07,
+      "loss": -0.003930883482098579,
+      "reward": 2.177262306213379,
+      "reward_std": 0.5171742737293243,
+      "rewards/GDino": 0.8225250542163849,
+      "rewards/GIT": 0.4376496821641922,
+      "rewards/HPSv2": 0.28387451171875,
+      "rewards/ORM": 0.6332131624221802,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -21.8125,
+      "step": 132
+    },
+    {
+      "completion_length": 48.515625,
+      "epoch": 0.14728682170542637,
+      "grad_norm": 0.5650766491889954,
+      "kl": 0.006622314453125,
+      "learning_rate": 9.168749999999999e-07,
+      "loss": -0.00018032779917120934,
+      "reward": 2.5767033100128174,
+      "reward_std": 0.21334625780582428,
+      "rewards/GDino": 0.768750011920929,
+      "rewards/GIT": 0.6930812895298004,
+      "rewards/HPSv2": 0.2711219787597656,
+      "rewards/ORM": 0.8437499701976776,
+      "self_certainty_semantic": -25.5,
+      "self_certainty_token": -21.3125,
+      "step": 133
+    },
+    {
+      "completion_length": 44.640625,
+      "epoch": 0.14839424141749724,
+      "grad_norm": 0.5156822800636292,
+      "kl": 0.004974365234375,
+      "learning_rate": 9.1625e-07,
+      "loss": 0.002806268632411957,
+      "reward": 1.9863407611846924,
+      "reward_std": 0.49281907081604004,
+      "rewards/GDino": 0.7205729484558105,
+      "rewards/GIT": 0.500615194439888,
+      "rewards/HPSv2": 0.25568580627441406,
+      "rewards/ORM": 0.5094669014215469,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -20.5,
+      "step": 134
+    },
+    {
+      "completion_length": 45.390625,
+      "epoch": 0.14950166112956811,
+      "grad_norm": 0.8464062809944153,
+      "kl": 0.0070343017578125,
+      "learning_rate": 9.15625e-07,
+      "loss": -0.019531114026904106,
+      "reward": 2.0325437784194946,
+      "reward_std": 0.40662893652915955,
+      "rewards/GDino": 0.7845472693443298,
+      "rewards/GIT": 0.48149144649505615,
+      "rewards/HPSv2": 0.2637767791748047,
+      "rewards/ORM": 0.5027283281087875,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -20.3125,
+      "step": 135
+    },
+    {
+      "completion_length": 51.34375,
+      "epoch": 0.150609080841639,
+      "grad_norm": 0.5737318992614746,
+      "kl": 0.0070343017578125,
+      "learning_rate": 9.15e-07,
+      "loss": -0.0035284715704619884,
+      "reward": 1.6841511130332947,
+      "reward_std": 0.3555753082036972,
+      "rewards/GDino": 0.6274834871292114,
+      "rewards/GIT": 0.27718986570835114,
+      "rewards/HPSv2": 0.2579154968261719,
+      "rewards/ORM": 0.521562248468399,
+      "self_certainty_semantic": -25.0,
+      "self_certainty_token": -21.0,
+      "step": 136
+    },
+    {
+      "completion_length": 44.890625,
+      "epoch": 0.15171650055370986,
+      "grad_norm": 0.9722269773483276,
+      "kl": 0.009033203125,
+      "learning_rate": 9.14375e-07,
+      "loss": -0.010386745911091566,
+      "reward": 2.115400731563568,
+      "reward_std": 0.4073975533246994,
+      "rewards/GDino": 0.7504827678203583,
+      "rewards/GIT": 0.5390563532710075,
+      "rewards/HPSv2": 0.25206947326660156,
+      "rewards/ORM": 0.5737921595573425,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -21.625,
+      "step": 137
+    },
+    {
+      "completion_length": 45.75,
+      "epoch": 0.15282392026578073,
+      "grad_norm": 0.7052740454673767,
+      "kl": 0.010101318359375,
+      "learning_rate": 9.137499999999999e-07,
+      "loss": -0.009516147896647453,
+      "reward": 1.8091301918029785,
+      "reward_std": 0.3067747950553894,
+      "rewards/GDino": 0.6395186185836792,
+      "rewards/GIT": 0.19299907237291336,
+      "rewards/HPSv2": 0.26455116271972656,
+      "rewards/ORM": 0.7120613753795624,
+      "self_certainty_semantic": -25.0625,
+      "self_certainty_token": -21.1875,
+      "step": 138
+    },
+    {
+      "completion_length": 52.609375,
+      "epoch": 0.1539313399778516,
+      "grad_norm": 0.49324488639831543,
+      "kl": 0.0060577392578125,
+      "learning_rate": 9.131249999999999e-07,
+      "loss": 0.019678042270243168,
+      "reward": 2.08823698759079,
+      "reward_std": 0.3092179298400879,
+      "rewards/GDino": 0.7869435846805573,
+      "rewards/GIT": 0.3419180363416672,
+      "rewards/HPSv2": 0.27410125732421875,
+      "rewards/ORM": 0.6852740943431854,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -21.5625,
+      "step": 139
+    },
+    {
+      "completion_length": 44.65625,
+      "epoch": 0.15503875968992248,
+      "grad_norm": 0.46719232201576233,
+      "kl": 0.005218505859375,
+      "learning_rate": 9.124999999999999e-07,
+      "loss": -0.016016804613173008,
+      "reward": 2.2374342679977417,
+      "reward_std": 0.4363926351070404,
+      "rewards/GDino": 0.8431436419487,
+      "rewards/GIT": 0.5656594336032867,
+      "rewards/HPSv2": 0.25456809997558594,
+      "rewards/ORM": 0.5740630030632019,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -21.5,
+      "step": 140
+    },
+    {
+      "completion_length": 58.671875,
+      "epoch": 0.15614617940199335,
+      "grad_norm": 0.4546290934085846,
+      "kl": 0.004730224609375,
+      "learning_rate": 9.11875e-07,
+      "loss": -0.0007884092628955841,
+      "reward": 2.369232416152954,
+      "reward_std": 0.263886496424675,
+      "rewards/GDino": 0.7109375298023224,
+      "rewards/GIT": 0.7504715323448181,
+      "rewards/HPSv2": 0.245635986328125,
+      "rewards/ORM": 0.6621872782707214,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -20.75,
+      "step": 141
+    },
+    {
+      "completion_length": 42.4375,
+      "epoch": 0.15725359911406422,
+      "grad_norm": 0.9214792251586914,
+      "kl": 0.009521484375,
+      "learning_rate": 9.1125e-07,
+      "loss": 0.003012734232470393,
+      "reward": 2.1930705904960632,
+      "reward_std": 0.39259086549282074,
+      "rewards/GDino": 0.7755208611488342,
+      "rewards/GIT": 0.5075857639312744,
+      "rewards/HPSv2": 0.2599220275878906,
+      "rewards/ORM": 0.6500419527292252,
+      "self_certainty_semantic": -25.4375,
+      "self_certainty_token": -20.625,
+      "step": 142
+    },
+    {
+      "completion_length": 44.984375,
+      "epoch": 0.1583610188261351,
+      "grad_norm": 0.622131884098053,
+      "kl": 0.007476806640625,
+      "learning_rate": 9.10625e-07,
+      "loss": 0.012470124522224069,
+      "reward": 1.945671796798706,
+      "reward_std": 0.3904338628053665,
+      "rewards/GDino": 0.7124259173870087,
+      "rewards/GIT": 0.36576879024505615,
+      "rewards/HPSv2": 0.2677898406982422,
+      "rewards/ORM": 0.599687248468399,
+      "self_certainty_semantic": -25.0625,
+      "self_certainty_token": -20.5625,
+      "step": 143
+    },
+    {
+      "completion_length": 50.4375,
+      "epoch": 0.15946843853820597,
+      "grad_norm": 0.5398712158203125,
+      "kl": 0.007080078125,
+      "learning_rate": 9.1e-07,
+      "loss": 0.009707295335829258,
+      "reward": 1.9473342895507812,
+      "reward_std": 0.411212295293808,
+      "rewards/GDino": 0.6565104126930237,
+      "rewards/GIT": 0.4918062835931778,
+      "rewards/HPSv2": 0.24593448638916016,
+      "rewards/ORM": 0.5530830472707748,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -21.1875,
+      "step": 144
+    },
+    {
+      "completion_length": 48.828125,
+      "epoch": 0.16057585825027684,
+      "grad_norm": 0.44240400195121765,
+      "kl": 0.0082550048828125,
+      "learning_rate": 9.09375e-07,
+      "loss": -0.015047748805955052,
+      "reward": 1.8875170946121216,
+      "reward_std": 0.37340451776981354,
+      "rewards/GDino": 0.6789085865020752,
+      "rewards/GIT": 0.31243064999580383,
+      "rewards/HPSv2": 0.2805948257446289,
+      "rewards/ORM": 0.6155830323696136,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -21.375,
+      "step": 145
+    },
+    {
+      "completion_length": 42.234375,
+      "epoch": 0.16168327796234774,
+      "grad_norm": 0.6678792238235474,
+      "kl": 0.0080718994140625,
+      "learning_rate": 9.087499999999999e-07,
+      "loss": 0.015477177686989307,
+      "reward": 2.123531699180603,
+      "reward_std": 0.4475431591272354,
+      "rewards/GDino": 0.8569894731044769,
+      "rewards/GIT": 0.35753606259822845,
+      "rewards/HPSv2": 0.26838111877441406,
+      "rewards/ORM": 0.640625,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -21.5,
+      "step": 146
+    },
+    {
+      "completion_length": 47.921875,
+      "epoch": 0.16279069767441862,
+      "grad_norm": 0.4761756658554077,
+      "kl": 0.007293701171875,
+      "learning_rate": 9.081249999999999e-07,
+      "loss": -0.008836451917886734,
+      "reward": 1.6021055579185486,
+      "reward_std": 0.30197490751743317,
+      "rewards/GDino": 0.6556249558925629,
+      "rewards/GIT": 0.1688888967037201,
+      "rewards/HPSv2": 0.24634170532226562,
+      "rewards/ORM": 0.5312500298023224,
+      "self_certainty_semantic": -25.0625,
+      "self_certainty_token": -22.0,
+      "step": 147
+    },
+    {
+      "completion_length": 46.921875,
+      "epoch": 0.1638981173864895,
+      "grad_norm": 0.6984077095985413,
+      "kl": 0.011322021484375,
+      "learning_rate": 9.074999999999999e-07,
+      "loss": -0.001696310006082058,
+      "reward": 2.1800928115844727,
+      "reward_std": 0.22076455503702164,
+      "rewards/GDino": 0.8397657871246338,
+      "rewards/GIT": 0.3664309233427048,
+      "rewards/HPSv2": 0.2707710266113281,
+      "rewards/ORM": 0.703125,
+      "self_certainty_semantic": -25.4375,
+      "self_certainty_token": -21.625,
+      "step": 148
+    },
+    {
+      "completion_length": 42.984375,
+      "epoch": 0.16500553709856036,
+      "grad_norm": 0.6648632884025574,
+      "kl": 0.008514404296875,
+      "learning_rate": 9.068749999999999e-07,
+      "loss": 0.0001570945605635643,
+      "reward": 2.0893077850341797,
+      "reward_std": 0.40299197286367416,
+      "rewards/GDino": 0.7465624809265137,
+      "rewards/GIT": 0.32270002365112305,
+      "rewards/HPSv2": 0.2650909423828125,
+      "rewards/ORM": 0.7549542784690857,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -21.0,
+      "step": 149
+    },
+    {
+      "completion_length": 46.765625,
+      "epoch": 0.16611295681063123,
+      "grad_norm": 0.4624801278114319,
+      "kl": 0.00653076171875,
+      "learning_rate": 9.0625e-07,
+      "loss": -0.00981504051014781,
+      "reward": 1.8526134490966797,
+      "reward_std": 0.49954167008399963,
+      "rewards/GDino": 0.7183263897895813,
+      "rewards/GIT": 0.28864332288503647,
+      "rewards/HPSv2": 0.2530975341796875,
+      "rewards/ORM": 0.5925461798906326,
+      "self_certainty_semantic": -25.0625,
+      "self_certainty_token": -21.3125,
+      "step": 150
+    },
+    {
+      "completion_length": 44.265625,
+      "epoch": 0.1672203765227021,
+      "grad_norm": 0.6167420744895935,
+      "kl": 0.008636474609375,
+      "learning_rate": 9.05625e-07,
+      "loss": 0.015115905553102493,
+      "reward": 2.337567687034607,
+      "reward_std": 0.42467789351940155,
+      "rewards/GDino": 0.8551518619060516,
+      "rewards/GIT": 0.40152132511138916,
+      "rewards/HPSv2": 0.2702198028564453,
+      "rewards/ORM": 0.8106746673583984,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -21.25,
+      "step": 151
+    },
+    {
+      "completion_length": 49.578125,
+      "epoch": 0.16832779623477298,
+      "grad_norm": 0.8095238208770752,
+      "kl": 0.006683349609375,
+      "learning_rate": 9.05e-07,
+      "loss": -0.005870660301297903,
+      "reward": 2.1157608032226562,
+      "reward_std": 0.20268037915229797,
+      "rewards/GDino": 0.8022373914718628,
+      "rewards/GIT": 0.4189887195825577,
+      "rewards/HPSv2": 0.27895164489746094,
+      "rewards/ORM": 0.6155830323696136,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -21.625,
+      "step": 152
+    },
+    {
+      "completion_length": 41.84375,
+      "epoch": 0.16943521594684385,
+      "grad_norm": 0.5198939442634583,
+      "kl": 0.0080718994140625,
+      "learning_rate": 9.04375e-07,
+      "loss": -0.014349173055961728,
+      "reward": 2.219430923461914,
+      "reward_std": 0.31192296743392944,
+      "rewards/GDino": 0.7789298295974731,
+      "rewards/GIT": 0.6309380829334259,
+      "rewards/HPSv2": 0.26394176483154297,
+      "rewards/ORM": 0.5456212162971497,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -22.125,
+      "step": 153
+    },
+    {
+      "completion_length": 39.796875,
+      "epoch": 0.17054263565891473,
+      "grad_norm": 0.4530474543571472,
+      "kl": 0.00823974609375,
+      "learning_rate": 9.0375e-07,
+      "loss": -0.009022563113830984,
+      "reward": 1.62737375497818,
+      "reward_std": 0.3316381424665451,
+      "rewards/GDino": 0.753333568572998,
+      "rewards/GIT": 0.17652657628059387,
+      "rewards/HPSv2": 0.2600135803222656,
+      "rewards/ORM": 0.4375,
+      "self_certainty_semantic": -25.125,
+      "self_certainty_token": -22.25,
+      "step": 154
+    },
+    {
+      "completion_length": 56.828125,
+      "epoch": 0.1716500553709856,
+      "grad_norm": 0.5859449505805969,
+      "kl": 0.01031494140625,
+      "learning_rate": 9.031249999999999e-07,
+      "loss": -0.006706917891278863,
+      "reward": 2.134092330932617,
+      "reward_std": 0.3308701366186142,
+      "rewards/GDino": 0.6453125327825546,
+      "rewards/GIT": 0.44731535762548447,
+      "rewards/HPSv2": 0.26494789123535156,
+      "rewards/ORM": 0.7765165567398071,
+      "self_certainty_semantic": -25.4375,
+      "self_certainty_token": -20.6875,
+      "step": 155
+    },
+    {
+      "completion_length": 41.078125,
+      "epoch": 0.17275747508305647,
+      "grad_norm": 0.5353882908821106,
+      "kl": 0.010894775390625,
+      "learning_rate": 9.024999999999999e-07,
+      "loss": 0.0037739332765340805,
+      "reward": 1.8532127141952515,
+      "reward_std": 0.3209614157676697,
+      "rewards/GDino": 0.6619158685207367,
+      "rewards/GIT": 0.23281337320804596,
+      "rewards/HPSv2": 0.2831287384033203,
+      "rewards/ORM": 0.6753546595573425,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -21.125,
+      "step": 156
+    },
+    {
+      "completion_length": 54.296875,
+      "epoch": 0.17386489479512734,
+      "grad_norm": 0.45864060521125793,
+      "kl": 0.0081787109375,
+      "learning_rate": 9.018749999999999e-07,
+      "loss": 0.01829966064542532,
+      "reward": 2.4461565017700195,
+      "reward_std": 0.3625805824995041,
+      "rewards/GDino": 0.8224999904632568,
+      "rewards/GIT": 0.7057079672813416,
+      "rewards/HPSv2": 0.24219322204589844,
+      "rewards/ORM": 0.6757553368806839,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -21.8125,
+      "step": 157
+    },
+    {
+      "completion_length": 35.96875,
+      "epoch": 0.17497231450719822,
+      "grad_norm": 0.6962554454803467,
+      "kl": 0.01031494140625,
+      "learning_rate": 9.0125e-07,
+      "loss": 0.018664106726646423,
+      "reward": 2.2535247802734375,
+      "reward_std": 0.3470146059989929,
+      "rewards/GDino": 0.8081650137901306,
+      "rewards/GIT": 0.43000543117523193,
+      "rewards/HPSv2": 0.2705402374267578,
+      "rewards/ORM": 0.7448140382766724,
+      "self_certainty_semantic": -25.125,
+      "self_certainty_token": -21.375,
+      "step": 158
+    },
+    {
+      "completion_length": 48.421875,
+      "epoch": 0.1760797342192691,
+      "grad_norm": 0.7126027941703796,
+      "kl": 0.0084686279296875,
+      "learning_rate": 9.00625e-07,
+      "loss": -0.01690885704010725,
+      "reward": 2.103760838508606,
+      "reward_std": 0.42456358671188354,
+      "rewards/GDino": 0.7740625143051147,
+      "rewards/GIT": 0.5006528943777084,
+      "rewards/HPSv2": 0.2603874206542969,
+      "rewards/ORM": 0.5686581134796143,
+      "self_certainty_semantic": -25.0625,
+      "self_certainty_token": -20.8125,
+      "step": 159
+    },
+    {
+      "completion_length": 47.15625,
+      "epoch": 0.17718715393134,
+      "grad_norm": 0.4899819791316986,
+      "kl": 0.011016845703125,
+      "learning_rate": 9e-07,
+      "loss": 0.011189845390617847,
+      "reward": 1.9280957579612732,
+      "reward_std": 0.3808829113841057,
+      "rewards/GDino": 0.7389523684978485,
+      "rewards/GIT": 0.29086190462112427,
+      "rewards/HPSv2": 0.27908897399902344,
+      "rewards/ORM": 0.6191926002502441,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -21.875,
+      "step": 160
+    },
+    {
+      "completion_length": 48.0,
+      "epoch": 0.17829457364341086,
+      "grad_norm": 0.8614944815635681,
+      "kl": 0.010162353515625,
+      "learning_rate": 8.99375e-07,
+      "loss": -0.0047345394268631935,
+      "reward": 2.126296818256378,
+      "reward_std": 0.4349767565727234,
+      "rewards/GDino": 0.6996158957481384,
+      "rewards/GIT": 0.4218016564846039,
+      "rewards/HPSv2": 0.26108741760253906,
+      "rewards/ORM": 0.743791937828064,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -20.75,
+      "step": 161
+    },
+    {
+      "completion_length": 57.40625,
+      "epoch": 0.17940199335548174,
+      "grad_norm": 0.4764331877231598,
+      "kl": 0.00958251953125,
+      "learning_rate": 8.9875e-07,
+      "loss": -0.02189162978902459,
+      "reward": 2.1861079335212708,
+      "reward_std": 0.2511523813009262,
+      "rewards/GDino": 0.9083333313465118,
+      "rewards/GIT": 0.5160972326993942,
+      "rewards/HPSv2": 0.2581977844238281,
+      "rewards/ORM": 0.5034796893596649,
+      "self_certainty_semantic": -25.4375,
+      "self_certainty_token": -21.5,
+      "step": 162
+    },
+    {
+      "completion_length": 41.796875,
+      "epoch": 0.1805094130675526,
+      "grad_norm": 0.63126540184021,
+      "kl": 0.01348876953125,
+      "learning_rate": 8.981249999999999e-07,
+      "loss": -0.010512399720028043,
+      "reward": 2.1882660388946533,
+      "reward_std": 0.26342111825942993,
+      "rewards/GDino": 0.7701247036457062,
+      "rewards/GIT": 0.637158066034317,
+      "rewards/HPSv2": 0.24700498580932617,
+      "rewards/ORM": 0.5339783430099487,
+      "self_certainty_semantic": -25.125,
+      "self_certainty_token": -20.625,
+      "step": 163
+    },
+    {
+      "completion_length": 43.78125,
+      "epoch": 0.18161683277962348,
+      "grad_norm": 0.4358772039413452,
+      "kl": 0.01190185546875,
+      "learning_rate": 8.974999999999999e-07,
+      "loss": -0.006003182148560882,
+      "reward": 2.6546283960342407,
+      "reward_std": 0.3610512763261795,
+      "rewards/GDino": 0.8648440539836884,
+      "rewards/GIT": 0.7706953585147858,
+      "rewards/HPSv2": 0.269439697265625,
+      "rewards/ORM": 0.749649316072464,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -20.6875,
+      "step": 164
+    },
+    {
+      "completion_length": 48.09375,
+      "epoch": 0.18272425249169436,
+      "grad_norm": 1.585694432258606,
+      "kl": 0.01043701171875,
+      "learning_rate": 8.96875e-07,
+      "loss": 8.579343557357788e-05,
+      "reward": 2.331842541694641,
+      "reward_std": 0.3782896548509598,
+      "rewards/GDino": 0.7984375357627869,
+      "rewards/GIT": 0.5101843625307083,
+      "rewards/HPSv2": 0.27196693420410156,
+      "rewards/ORM": 0.751253753900528,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -21.8125,
+      "step": 165
+    },
+    {
+      "completion_length": 45.34375,
+      "epoch": 0.18383167220376523,
+      "grad_norm": 0.45005306601524353,
+      "kl": 0.01312255859375,
+      "learning_rate": 8.9625e-07,
+      "loss": -0.006219237111508846,
+      "reward": 2.192749261856079,
+      "reward_std": 0.2947642654180527,
+      "rewards/GDino": 0.7351250052452087,
+      "rewards/GIT": 0.45546063780784607,
+      "rewards/HPSv2": 0.24868392944335938,
+      "rewards/ORM": 0.7534796893596649,
+      "self_certainty_semantic": -25.0625,
+      "self_certainty_token": -21.25,
+      "step": 166
+    },
+    {
+      "completion_length": 43.015625,
+      "epoch": 0.1849390919158361,
+      "grad_norm": 0.5231027007102966,
+      "kl": 0.010772705078125,
+      "learning_rate": 8.95625e-07,
+      "loss": -0.007611713605001569,
+      "reward": 2.0297417044639587,
+      "reward_std": 0.4215656816959381,
+      "rewards/GDino": 0.7420200109481812,
+      "rewards/GIT": 0.5049543976783752,
+      "rewards/HPSv2": 0.26178741455078125,
+      "rewards/ORM": 0.5209799110889435,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -21.75,
+      "step": 167
+    },
+    {
+      "completion_length": 53.46875,
+      "epoch": 0.18604651162790697,
+      "grad_norm": 0.4332805871963501,
+      "kl": 0.0062408447265625,
+      "learning_rate": 8.95e-07,
+      "loss": -0.004800099181011319,
+      "reward": 2.05144202709198,
+      "reward_std": 0.3129463642835617,
+      "rewards/GDino": 0.7780522406101227,
+      "rewards/GIT": 0.5233069062232971,
+      "rewards/HPSv2": 0.2612953186035156,
+      "rewards/ORM": 0.4887876957654953,
+      "self_certainty_semantic": -25.0,
+      "self_certainty_token": -21.1875,
+      "step": 168
+    },
+    {
+      "completion_length": 37.90625,
+      "epoch": 0.18715393133997785,
+      "grad_norm": 0.45867836475372314,
+      "kl": 0.01751708984375,
+      "learning_rate": 8.94375e-07,
+      "loss": 0.004684945801272988,
+      "reward": 2.2036190032958984,
+      "reward_std": 0.26221713423728943,
+      "rewards/GDino": 0.8604569435119629,
+      "rewards/GIT": 0.4576933681964874,
+      "rewards/HPSv2": 0.28287315368652344,
+      "rewards/ORM": 0.6025954186916351,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -21.5,
+      "step": 169
+    },
+    {
+      "completion_length": 41.375,
+      "epoch": 0.18826135105204872,
+      "grad_norm": 0.4206317365169525,
+      "kl": 0.009979248046875,
+      "learning_rate": 8.9375e-07,
+      "loss": 0.0024775206111371517,
+      "reward": 1.943125069141388,
+      "reward_std": 0.4646635055541992,
+      "rewards/GDino": 0.7305906116962433,
+      "rewards/GIT": 0.4165241867303848,
+      "rewards/HPSv2": 0.24480247497558594,
+      "rewards/ORM": 0.5512078106403351,
+      "self_certainty_semantic": -25.0625,
+      "self_certainty_token": -21.125,
+      "step": 170
+    },
+    {
+      "completion_length": 40.703125,
+      "epoch": 0.1893687707641196,
+      "grad_norm": 0.697102963924408,
+      "kl": 0.010833740234375,
+      "learning_rate": 8.931249999999999e-07,
+      "loss": -0.003134746104478836,
+      "reward": 2.203751564025879,
+      "reward_std": 0.3228776603937149,
+      "rewards/GDino": 0.7603735029697418,
+      "rewards/GIT": 0.5309787690639496,
+      "rewards/HPSv2": 0.2583751678466797,
+      "rewards/ORM": 0.654024064540863,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -21.1875,
+      "step": 171
+    },
+    {
+      "completion_length": 50.515625,
+      "epoch": 0.19047619047619047,
+      "grad_norm": 0.5617944002151489,
+      "kl": 0.008392333984375,
+      "learning_rate": 8.924999999999999e-07,
+      "loss": 0.03539674496278167,
+      "reward": 1.7965713739395142,
+      "reward_std": 0.3860231041908264,
+      "rewards/GDino": 0.6837728917598724,
+      "rewards/GIT": 0.4299345314502716,
+      "rewards/HPSv2": 0.2496967315673828,
+      "rewards/ORM": 0.4331671893596649,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -21.25,
+      "step": 172
+    },
+    {
+      "completion_length": 35.5625,
+      "epoch": 0.19158361018826134,
+      "grad_norm": 0.4585064649581909,
+      "kl": 0.017242431640625,
+      "learning_rate": 8.918749999999999e-07,
+      "loss": 0.002358448226004839,
+      "reward": 2.1448813676834106,
+      "reward_std": 0.31261830031871796,
+      "rewards/GDino": 0.834684431552887,
+      "rewards/GIT": 0.36106863617897034,
+      "rewards/HPSv2": 0.28739356994628906,
+      "rewards/ORM": 0.6617347896099091,
+      "self_certainty_semantic": -25.125,
+      "self_certainty_token": -22.5625,
+      "step": 173
+    },
+    {
+      "completion_length": 46.859375,
+      "epoch": 0.19269102990033224,
+      "grad_norm": 0.42562612891197205,
+      "kl": 0.00872802734375,
+      "learning_rate": 8.912499999999999e-07,
+      "loss": 0.0036601885221898556,
+      "reward": 2.2070860862731934,
+      "reward_std": 0.3847656697034836,
+      "rewards/GDino": 0.7723565697669983,
+      "rewards/GIT": 0.5430227518081665,
+      "rewards/HPSv2": 0.25701904296875,
+      "rewards/ORM": 0.634687751531601,
+      "self_certainty_semantic": -25.0,
+      "self_certainty_token": -20.875,
+      "step": 174
+    },
+    {
+      "completion_length": 46.0,
+      "epoch": 0.1937984496124031,
+      "grad_norm": 0.5808805227279663,
+      "kl": 0.0078277587890625,
+      "learning_rate": 8.906249999999999e-07,
+      "loss": 0.032253723591566086,
+      "reward": 2.157870888710022,
+      "reward_std": 0.2512262612581253,
+      "rewards/GDino": 0.7799479365348816,
+      "rewards/GIT": 0.5305047780275345,
+      "rewards/HPSv2": 0.2598762512207031,
+      "rewards/ORM": 0.587541937828064,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -21.125,
+      "step": 175
+    },
+    {
+      "completion_length": 58.375,
+      "epoch": 0.19490586932447398,
+      "grad_norm": 0.5789065957069397,
+      "kl": 0.0071868896484375,
+      "learning_rate": 8.9e-07,
+      "loss": -0.014507739804685116,
+      "reward": 1.955940306186676,
+      "reward_std": 0.3165567219257355,
+      "rewards/GDino": 0.7864583432674408,
+      "rewards/GIT": 0.4266613572835922,
+      "rewards/HPSv2": 0.25844573974609375,
+      "rewards/ORM": 0.484375,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -20.9375,
+      "step": 176
+    },
+    {
+      "completion_length": 51.328125,
+      "epoch": 0.19601328903654486,
+      "grad_norm": 1.309583067893982,
+      "kl": 0.013214111328125,
+      "learning_rate": 8.89375e-07,
+      "loss": -0.0020376548636704683,
+      "reward": 2.2406471967697144,
+      "reward_std": 0.3685739040374756,
+      "rewards/GDino": 0.7480616569519043,
+      "rewards/GIT": 0.4961736798286438,
+      "rewards/HPSv2": 0.25730323791503906,
+      "rewards/ORM": 0.7391084432601929,
+      "self_certainty_semantic": -25.4375,
+      "self_certainty_token": -22.25,
+      "step": 177
+    },
+    {
+      "completion_length": 56.609375,
+      "epoch": 0.19712070874861573,
+      "grad_norm": 0.6938892602920532,
+      "kl": 0.008270263671875,
+      "learning_rate": 8.8875e-07,
+      "loss": 0.005612233653664589,
+      "reward": 2.1296470165252686,
+      "reward_std": 0.3742387443780899,
+      "rewards/GDino": 0.7609374821186066,
+      "rewards/GIT": 0.4125446677207947,
+      "rewards/HPSv2": 0.2701892852783203,
+      "rewards/ORM": 0.6859754621982574,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -21.125,
+      "step": 178
+    },
+    {
+      "completion_length": 37.234375,
+      "epoch": 0.1982281284606866,
+      "grad_norm": 0.4790467917919159,
+      "kl": 0.006927490234375,
+      "learning_rate": 8.88125e-07,
+      "loss": 0.01884503196924925,
+      "reward": 2.431138515472412,
+      "reward_std": 0.24652785062789917,
+      "rewards/GDino": 0.8910974562168121,
+      "rewards/GIT": 0.5796742737293243,
+      "rewards/HPSv2": 0.28255462646484375,
+      "rewards/ORM": 0.677812248468399,
+      "self_certainty_semantic": -25.125,
+      "self_certainty_token": -22.375,
+      "step": 179
+    },
+    {
+      "completion_length": 45.921875,
+      "epoch": 0.19933554817275748,
+      "grad_norm": 0.4324551522731781,
+      "kl": 0.011749267578125,
+      "learning_rate": 8.874999999999999e-07,
+      "loss": -0.016566987615078688,
+      "reward": 1.7641431093215942,
+      "reward_std": 0.42914003133773804,
+      "rewards/GDino": 0.7295474410057068,
+      "rewards/GIT": 0.32154107093811035,
+      "rewards/HPSv2": 0.25917816162109375,
+      "rewards/ORM": 0.4538763463497162,
+      "self_certainty_semantic": -25.125,
+      "self_certainty_token": -21.5,
+      "step": 180
+    },
+    {
+      "completion_length": 44.921875,
+      "epoch": 0.20044296788482835,
+      "grad_norm": 0.7271856665611267,
+      "kl": 0.008270263671875,
+      "learning_rate": 8.86875e-07,
+      "loss": -0.01033696997910738,
+      "reward": 1.8466984629631042,
+      "reward_std": 0.33254362642765045,
+      "rewards/GDino": 0.6390625238418579,
+      "rewards/GIT": 0.35668525099754333,
+      "rewards/HPSv2": 0.27863311767578125,
+      "rewards/ORM": 0.5723176002502441,
+      "self_certainty_semantic": -25.0625,
+      "self_certainty_token": -21.125,
+      "step": 181
+    },
+    {
+      "completion_length": 40.84375,
+      "epoch": 0.20155038759689922,
+      "grad_norm": 0.5530015230178833,
+      "kl": 0.02142333984375,
+      "learning_rate": 8.8625e-07,
+      "loss": -0.008886129595339298,
+      "reward": 1.7842278480529785,
+      "reward_std": 0.4365523010492325,
+      "rewards/GDino": 0.7104989886283875,
+      "rewards/GIT": 0.34373709559440613,
+      "rewards/HPSv2": 0.27834129333496094,
+      "rewards/ORM": 0.4516504108905792,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -22.0,
+      "step": 182
+    },
+    {
+      "completion_length": 51.34375,
+      "epoch": 0.2026578073089701,
+      "grad_norm": 0.545352041721344,
+      "kl": 0.015411376953125,
+      "learning_rate": 8.85625e-07,
+      "loss": -0.010915862862020731,
+      "reward": 1.9055233001708984,
+      "reward_std": 0.38334617018699646,
+      "rewards/GDino": 0.7308869063854218,
+      "rewards/GIT": 0.3478083163499832,
+      "rewards/HPSv2": 0.2643280029296875,
+      "rewards/ORM": 0.5625000298023224,
+      "self_certainty_semantic": -25.25,
+      "self_certainty_token": -21.5,
+      "step": 183
+    },
+    {
+      "completion_length": 38.890625,
+      "epoch": 0.20376522702104097,
+      "grad_norm": 0.5411937832832336,
+      "kl": 0.01177978515625,
+      "learning_rate": 8.85e-07,
+      "loss": 0.013757664943113923,
+      "reward": 2.716238021850586,
+      "reward_std": 0.29777073860168457,
+      "rewards/GDino": 0.925000011920929,
+      "rewards/GIT": 0.7263616919517517,
+      "rewards/HPSv2": 0.2595672607421875,
+      "rewards/ORM": 0.805308997631073,
+      "self_certainty_semantic": -25.0,
+      "self_certainty_token": -21.4375,
+      "step": 184
+    },
+    {
+      "completion_length": 41.65625,
+      "epoch": 0.20487264673311184,
+      "grad_norm": 1.4033399820327759,
+      "kl": 0.016143798828125,
+      "learning_rate": 8.84375e-07,
+      "loss": 0.016424793750047684,
+      "reward": 2.2067692279815674,
+      "reward_std": 0.44875267148017883,
+      "rewards/GDino": 0.8457056879997253,
+      "rewards/GIT": 0.31159064173698425,
+      "rewards/HPSv2": 0.275634765625,
+      "rewards/ORM": 0.7738381326198578,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -22.375,
+      "step": 185
+    },
+    {
+      "completion_length": 40.21875,
+      "epoch": 0.2059800664451827,
+      "grad_norm": 0.545001208782196,
+      "kl": 0.0111083984375,
+      "learning_rate": 8.8375e-07,
+      "loss": 0.010479988530278206,
+      "reward": 1.9079334735870361,
+      "reward_std": 0.3735552281141281,
+      "rewards/GDino": 0.6260845363140106,
+      "rewards/GIT": 0.33919139206409454,
+      "rewards/HPSv2": 0.2270364761352539,
+      "rewards/ORM": 0.7156209945678711,
+      "self_certainty_semantic": -24.9375,
+      "self_certainty_token": -21.375,
+      "step": 186
+    },
+    {
+      "completion_length": 42.25,
+      "epoch": 0.2070874861572536,
+      "grad_norm": 0.5201876759529114,
+      "kl": 0.012786865234375,
+      "learning_rate": 8.83125e-07,
+      "loss": -0.005474693141877651,
+      "reward": 2.160323202610016,
+      "reward_std": 0.380416139960289,
+      "rewards/GDino": 0.8049721121788025,
+      "rewards/GIT": 0.5423709452152252,
+      "rewards/HPSv2": 0.25914573669433594,
+      "rewards/ORM": 0.5538343787193298,
+      "self_certainty_semantic": -25.0625,
+      "self_certainty_token": -20.625,
+      "step": 187
+    },
+    {
+      "completion_length": 41.546875,
+      "epoch": 0.2081949058693245,
+      "grad_norm": 1.0598704814910889,
+      "kl": 0.0123291015625,
+      "learning_rate": 8.824999999999999e-07,
+      "loss": -0.005072480300441384,
+      "reward": 2.198424220085144,
+      "reward_std": 0.318149596452713,
+      "rewards/GDino": 0.77506023645401,
+      "rewards/GIT": 0.2563297525048256,
+      "rewards/HPSv2": 0.26827430725097656,
+      "rewards/ORM": 0.8987601101398468,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -21.6875,
+      "step": 188
+    },
+    {
+      "completion_length": 45.859375,
+      "epoch": 0.20930232558139536,
+      "grad_norm": 1.0103754997253418,
+      "kl": 0.0113983154296875,
+      "learning_rate": 8.818749999999999e-07,
+      "loss": 0.006917888764292002,
+      "reward": 2.355333089828491,
+      "reward_std": 0.26609520614147186,
+      "rewards/GDino": 0.7847018539905548,
+      "rewards/GIT": 0.5574060827493668,
+      "rewards/HPSv2": 0.2811260223388672,
+      "rewards/ORM": 0.7320991158485413,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -20.8125,
+      "step": 189
+    },
+    {
+      "completion_length": 43.21875,
+      "epoch": 0.21040974529346623,
+      "grad_norm": 1.6472647190093994,
+      "kl": 0.021514892578125,
+      "learning_rate": 8.812499999999999e-07,
+      "loss": 0.024224724620580673,
+      "reward": 2.3357620239257812,
+      "reward_std": 0.36145227402448654,
+      "rewards/GDino": 0.9042215049266815,
+      "rewards/GIT": 0.44460529088974,
+      "rewards/HPSv2": 0.26693153381347656,
+      "rewards/ORM": 0.720003753900528,
+      "self_certainty_semantic": -25.0625,
+      "self_certainty_token": -21.125,
+      "step": 190
+    },
+    {
+      "completion_length": 39.84375,
+      "epoch": 0.2115171650055371,
+      "grad_norm": 0.4037325084209442,
+      "kl": 0.010467529296875,
+      "learning_rate": 8.806249999999999e-07,
+      "loss": 0.011584978085011244,
+      "reward": 2.569461226463318,
+      "reward_std": 0.31772294640541077,
+      "rewards/GDino": 0.8936654925346375,
+      "rewards/GIT": 0.6064814329147339,
+      "rewards/HPSv2": 0.28333091735839844,
+      "rewards/ORM": 0.7859834432601929,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -21.75,
+      "step": 191
+    },
+    {
+      "completion_length": 54.203125,
+      "epoch": 0.21262458471760798,
+      "grad_norm": 1.364973545074463,
+      "kl": 0.02630615234375,
+      "learning_rate": 8.799999999999999e-07,
+      "loss": -0.045025499537587166,
+      "reward": 1.7345000505447388,
+      "reward_std": 0.2504274845123291,
+      "rewards/GDino": 0.6147945821285248,
+      "rewards/GIT": 0.4107672870159149,
+      "rewards/HPSv2": 0.24933433532714844,
+      "rewards/ORM": 0.45960381627082825,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -20.4375,
+      "step": 192
+    },
+    {
+      "completion_length": 41.25,
+      "epoch": 0.21373200442967885,
+      "grad_norm": 0.4461517333984375,
+      "kl": 0.0135498046875,
+      "learning_rate": 8.793749999999999e-07,
+      "loss": -0.006084040272980928,
+      "reward": 1.8086916208267212,
+      "reward_std": 0.4172170013189316,
+      "rewards/GDino": 0.6766185760498047,
+      "rewards/GIT": 0.20515873283147812,
+      "rewards/HPSv2": 0.2862892150878906,
+      "rewards/ORM": 0.640625,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -21.875,
+      "step": 193
+    },
+    {
+      "completion_length": 46.046875,
+      "epoch": 0.21483942414174972,
+      "grad_norm": 0.6739068031311035,
+      "kl": 0.00982666015625,
+      "learning_rate": 8.7875e-07,
+      "loss": 0.005553322844207287,
+      "reward": 2.0827959775924683,
+      "reward_std": 0.38654056191444397,
+      "rewards/GDino": 0.8122715353965759,
+      "rewards/GIT": 0.5369465202093124,
+      "rewards/HPSv2": 0.2711658477783203,
+      "rewards/ORM": 0.46241210401058197,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -20.875,
+      "step": 194
+    },
+    {
+      "completion_length": 41.984375,
+      "epoch": 0.2159468438538206,
+      "grad_norm": 0.5966407060623169,
+      "kl": 0.01617431640625,
+      "learning_rate": 8.78125e-07,
+      "loss": 0.006598036969080567,
+      "reward": 2.0439035892486572,
+      "reward_std": 0.28816351294517517,
+      "rewards/GDino": 0.7164062261581421,
+      "rewards/GIT": 0.583609938621521,
+      "rewards/HPSv2": 0.27428436279296875,
+      "rewards/ORM": 0.46960312128067017,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -21.5,
+      "step": 195
+    },
+    {
+      "completion_length": 52.09375,
+      "epoch": 0.21705426356589147,
+      "grad_norm": 0.4805348217487335,
+      "kl": 0.007720947265625,
+      "learning_rate": 8.774999999999999e-07,
+      "loss": -0.005165549926459789,
+      "reward": 2.242133378982544,
+      "reward_std": 0.2688770145177841,
+      "rewards/GDino": 0.7020833194255829,
+      "rewards/GIT": 0.539276048541069,
+      "rewards/HPSv2": 0.2836284637451172,
+      "rewards/ORM": 0.7171455323696136,
+      "self_certainty_semantic": -25.375,
+      "self_certainty_token": -21.625,
+      "step": 196
+    },
+    {
+      "completion_length": 43.796875,
+      "epoch": 0.21816168327796234,
+      "grad_norm": 0.635823667049408,
+      "kl": 0.01513671875,
+      "learning_rate": 8.76875e-07,
+      "loss": -0.0036483434960246086,
+      "reward": 2.2711516618728638,
+      "reward_std": 0.4390978515148163,
+      "rewards/GDino": 0.6989583671092987,
+      "rewards/GIT": 0.5045495182275772,
+      "rewards/HPSv2": 0.25634765625,
+      "rewards/ORM": 0.8112961947917938,
+      "self_certainty_semantic": -25.3125,
+      "self_certainty_token": -20.875,
+      "step": 197
+    },
+    {
+      "completion_length": 42.03125,
+      "epoch": 0.21926910299003322,
+      "grad_norm": 1.2338166236877441,
+      "kl": 0.012359619140625,
+      "learning_rate": 8.7625e-07,
+      "loss": 0.007895001443102956,
+      "reward": 2.048970103263855,
+      "reward_std": 0.40481944382190704,
+      "rewards/GDino": 0.8076697587966919,
+      "rewards/GIT": 0.5579419136047363,
+      "rewards/HPSv2": 0.2553253173828125,
+      "rewards/ORM": 0.42803309112787247,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -21.875,
+      "step": 198
+    },
+    {
+      "completion_length": 45.390625,
+      "epoch": 0.2203765227021041,
+      "grad_norm": 0.5174763798713684,
+      "kl": 0.01416015625,
+      "learning_rate": 8.75625e-07,
+      "loss": 0.016770444810390472,
+      "reward": 2.5844353437423706,
+      "reward_std": 0.2750231549143791,
+      "rewards/GDino": 0.8541243076324463,
+      "rewards/GIT": 0.6642543226480484,
+      "rewards/HPSv2": 0.25846099853515625,
+      "rewards/ORM": 0.8075956404209137,
+      "self_certainty_semantic": -25.1875,
+      "self_certainty_token": -21.875,
+      "step": 199
+    },
+    {
+      "completion_length": 36.875,
+      "epoch": 0.22148394241417496,
+      "grad_norm": 0.509035587310791,
+      "kl": 0.01654052734375,
+      "learning_rate": 8.75e-07,
+      "loss": 0.006852276623249054,
+      "reward": 2.065316915512085,
+      "reward_std": 0.3316378742456436,
+      "rewards/GDino": 0.7682685256004333,
+      "rewards/GIT": 0.2910846248269081,
+      "rewards/HPSv2": 0.2672557830810547,
+      "rewards/ORM": 0.7387078404426575,
+      "self_certainty_semantic": -25.4375,
+      "self_certainty_token": -21.625,
+      "step": 200
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1600,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}