{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.38278977185729596, "eval_steps": 10, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001531159087429184, "grad_norm": 270.11077880859375, "learning_rate": 4.996172102281427e-06, "loss": 1.2137, "step": 10 }, { "epoch": 0.001531159087429184, "eval_accuracy": 0.5366216907106497, "eval_loss": 1.2146648168563843, "eval_runtime": 277.3281, "eval_samples_per_second": 162.796, "eval_steps_per_second": 20.351, "step": 10 }, { "epoch": 0.003062318174858368, "grad_norm": 69.96483612060547, "learning_rate": 4.9923442045628545e-06, "loss": 0.8743, "step": 20 }, { "epoch": 0.003062318174858368, "eval_accuracy": 0.5232508865445948, "eval_loss": 1.02613365650177, "eval_runtime": 276.3955, "eval_samples_per_second": 163.346, "eval_steps_per_second": 20.42, "step": 20 }, { "epoch": 0.004593477262287551, "grad_norm": 120.09966278076172, "learning_rate": 4.988516306844281e-06, "loss": 1.0324, "step": 30 }, { "epoch": 0.004593477262287551, "eval_accuracy": 0.5467704523151039, "eval_loss": 0.9261217713356018, "eval_runtime": 277.0496, "eval_samples_per_second": 162.96, "eval_steps_per_second": 20.372, "step": 30 }, { "epoch": 0.006124636349716736, "grad_norm": 76.13516998291016, "learning_rate": 4.984688409125709e-06, "loss": 0.9529, "step": 40 }, { "epoch": 0.006124636349716736, "eval_accuracy": 0.5361401310932641, "eval_loss": 0.8699733018875122, "eval_runtime": 276.196, "eval_samples_per_second": 163.464, "eval_steps_per_second": 20.435, "step": 40 }, { "epoch": 0.007655795437145919, "grad_norm": 51.14548110961914, "learning_rate": 4.980860511407135e-06, "loss": 0.8279, "step": 50 }, { "epoch": 0.007655795437145919, "eval_accuracy": 0.5364695340501792, "eval_loss": 0.8454414010047913, "eval_runtime": 277.7047, "eval_samples_per_second": 162.576, "eval_steps_per_second": 20.324, "step": 50 }, { "epoch": 0.009186954524575103, "grad_norm": 29.365215301513672, "learning_rate": 4.977032613688563e-06, "loss": 0.6843, "step": 60 }, { "epoch": 0.009186954524575103, "eval_accuracy": 0.5485189652861546, "eval_loss": 0.8400074243545532, "eval_runtime": 276.2282, "eval_samples_per_second": 163.445, "eval_steps_per_second": 20.432, "step": 60 }, { "epoch": 0.010718113612004287, "grad_norm": 167.26617431640625, "learning_rate": 4.9732047159699895e-06, "loss": 0.8245, "step": 70 }, { "epoch": 0.010718113612004287, "eval_accuracy": 0.5589761867634938, "eval_loss": 1.03753662109375, "eval_runtime": 280.6426, "eval_samples_per_second": 160.874, "eval_steps_per_second": 20.111, "step": 70 }, { "epoch": 0.012249272699433471, "grad_norm": 104.5846939086914, "learning_rate": 4.969376818251417e-06, "loss": 0.8412, "step": 80 }, { "epoch": 0.012249272699433471, "eval_accuracy": 0.5760374802939805, "eval_loss": 0.7869167923927307, "eval_runtime": 281.2402, "eval_samples_per_second": 160.532, "eval_steps_per_second": 20.068, "step": 80 }, { "epoch": 0.013780431786862656, "grad_norm": 35.11406326293945, "learning_rate": 4.965548920532844e-06, "loss": 0.8279, "step": 90 }, { "epoch": 0.013780431786862656, "eval_accuracy": 0.5787423483583751, "eval_loss": 0.724981963634491, "eval_runtime": 277.5138, "eval_samples_per_second": 162.687, "eval_steps_per_second": 20.338, "step": 90 }, { "epoch": 0.015311590874291839, "grad_norm": 25.292316436767578, "learning_rate": 4.961721022814271e-06, "loss": 0.6736, "step": 100 }, { "epoch": 0.015311590874291839, "eval_accuracy": 0.5338034748948856, "eval_loss": 0.8150990009307861, "eval_runtime": 278.062, "eval_samples_per_second": 162.367, "eval_steps_per_second": 20.298, "step": 100 }, { "epoch": 0.016842749961721023, "grad_norm": 21.010047912597656, "learning_rate": 4.957893125095698e-06, "loss": 0.729, "step": 110 }, { "epoch": 0.016842749961721023, "eval_accuracy": 0.5677641824249166, "eval_loss": 0.833014965057373, "eval_runtime": 278.6412, "eval_samples_per_second": 162.029, "eval_steps_per_second": 20.255, "step": 110 }, { "epoch": 0.018373909049150206, "grad_norm": 29.881685256958008, "learning_rate": 4.954065227377125e-06, "loss": 0.771, "step": 120 }, { "epoch": 0.018373909049150206, "eval_accuracy": 0.5731047331765958, "eval_loss": 0.7811030745506287, "eval_runtime": 276.9087, "eval_samples_per_second": 163.043, "eval_steps_per_second": 20.382, "step": 120 }, { "epoch": 0.01990506813657939, "grad_norm": 15.595231056213379, "learning_rate": 4.950237329658552e-06, "loss": 0.7783, "step": 130 }, { "epoch": 0.01990506813657939, "eval_accuracy": 0.5626015898109594, "eval_loss": 0.7167708873748779, "eval_runtime": 277.1636, "eval_samples_per_second": 162.893, "eval_steps_per_second": 20.363, "step": 130 }, { "epoch": 0.021436227224008574, "grad_norm": 22.25090789794922, "learning_rate": 4.946409431939979e-06, "loss": 0.6794, "step": 140 }, { "epoch": 0.021436227224008574, "eval_accuracy": 0.5578002997964069, "eval_loss": 0.7202938795089722, "eval_runtime": 278.0355, "eval_samples_per_second": 162.382, "eval_steps_per_second": 20.3, "step": 140 }, { "epoch": 0.022967386311437757, "grad_norm": 37.13331604003906, "learning_rate": 4.942581534221406e-06, "loss": 0.7133, "step": 150 }, { "epoch": 0.022967386311437757, "eval_accuracy": 0.5772544642857143, "eval_loss": 0.7133862972259521, "eval_runtime": 277.8264, "eval_samples_per_second": 162.504, "eval_steps_per_second": 20.315, "step": 150 }, { "epoch": 0.024498545398866943, "grad_norm": 28.552654266357422, "learning_rate": 4.9387536365028335e-06, "loss": 0.7334, "step": 160 }, { "epoch": 0.024498545398866943, "eval_accuracy": 0.5714637421665174, "eval_loss": 0.7050605416297913, "eval_runtime": 277.1231, "eval_samples_per_second": 162.917, "eval_steps_per_second": 20.366, "step": 160 }, { "epoch": 0.026029704486296126, "grad_norm": 18.05495834350586, "learning_rate": 4.93492573878426e-06, "loss": 0.7056, "step": 170 }, { "epoch": 0.026029704486296126, "eval_accuracy": 0.5888312385382655, "eval_loss": 0.6785433888435364, "eval_runtime": 275.2348, "eval_samples_per_second": 164.034, "eval_steps_per_second": 20.506, "step": 170 }, { "epoch": 0.02756086357372531, "grad_norm": 14.354559898376465, "learning_rate": 4.931097841065688e-06, "loss": 0.6317, "step": 180 }, { "epoch": 0.02756086357372531, "eval_accuracy": 0.5905926454837558, "eval_loss": 0.6833683252334595, "eval_runtime": 278.3913, "eval_samples_per_second": 162.175, "eval_steps_per_second": 20.274, "step": 180 }, { "epoch": 0.029092022661154494, "grad_norm": 10.222257614135742, "learning_rate": 4.927269943347114e-06, "loss": 0.6975, "step": 190 }, { "epoch": 0.029092022661154494, "eval_accuracy": 0.5688126660860633, "eval_loss": 0.6987228989601135, "eval_runtime": 278.0516, "eval_samples_per_second": 162.373, "eval_steps_per_second": 20.298, "step": 190 }, { "epoch": 0.030623181748583677, "grad_norm": 19.388473510742188, "learning_rate": 4.923442045628542e-06, "loss": 0.675, "step": 200 }, { "epoch": 0.030623181748583677, "eval_accuracy": 0.5726792755463477, "eval_loss": 0.7145602107048035, "eval_runtime": 280.8737, "eval_samples_per_second": 160.741, "eval_steps_per_second": 20.094, "step": 200 }, { "epoch": 0.03215434083601286, "grad_norm": 21.06436538696289, "learning_rate": 4.919614147909968e-06, "loss": 0.6576, "step": 210 }, { "epoch": 0.03215434083601286, "eval_accuracy": 0.5798941563639598, "eval_loss": 0.7224599123001099, "eval_runtime": 276.3195, "eval_samples_per_second": 163.391, "eval_steps_per_second": 20.426, "step": 210 }, { "epoch": 0.033685499923442046, "grad_norm": 14.679203987121582, "learning_rate": 4.915786250191396e-06, "loss": 0.631, "step": 220 }, { "epoch": 0.033685499923442046, "eval_accuracy": 0.5863426131815351, "eval_loss": 0.6974319815635681, "eval_runtime": 275.464, "eval_samples_per_second": 163.898, "eval_steps_per_second": 20.489, "step": 220 }, { "epoch": 0.03521665901087123, "grad_norm": 22.14579963684082, "learning_rate": 4.9119583524728225e-06, "loss": 0.7267, "step": 230 }, { "epoch": 0.03521665901087123, "eval_accuracy": 0.5820226223144368, "eval_loss": 0.7019667625427246, "eval_runtime": 276.7093, "eval_samples_per_second": 163.16, "eval_steps_per_second": 20.397, "step": 230 }, { "epoch": 0.03674781809830041, "grad_norm": 7.788635730743408, "learning_rate": 4.908130454754249e-06, "loss": 0.6079, "step": 240 }, { "epoch": 0.03674781809830041, "eval_accuracy": 0.5955016145195412, "eval_loss": 0.6954487562179565, "eval_runtime": 279.6934, "eval_samples_per_second": 161.42, "eval_steps_per_second": 20.179, "step": 240 }, { "epoch": 0.0382789771857296, "grad_norm": 10.69842529296875, "learning_rate": 4.904302557035676e-06, "loss": 0.6862, "step": 250 }, { "epoch": 0.0382789771857296, "eval_accuracy": 0.5917668001780151, "eval_loss": 0.7000778913497925, "eval_runtime": 279.4881, "eval_samples_per_second": 161.538, "eval_steps_per_second": 20.194, "step": 250 }, { "epoch": 0.03981013627315878, "grad_norm": 11.247802734375, "learning_rate": 4.900474659317103e-06, "loss": 0.6138, "step": 260 }, { "epoch": 0.03981013627315878, "eval_accuracy": 0.5918217597225002, "eval_loss": 0.7174475789070129, "eval_runtime": 279.259, "eval_samples_per_second": 161.671, "eval_steps_per_second": 20.211, "step": 260 }, { "epoch": 0.04134129536058796, "grad_norm": 14.550101280212402, "learning_rate": 4.89664676159853e-06, "loss": 0.7057, "step": 270 }, { "epoch": 0.04134129536058796, "eval_accuracy": 0.5835486457590877, "eval_loss": 0.7064123749732971, "eval_runtime": 279.3626, "eval_samples_per_second": 161.611, "eval_steps_per_second": 20.203, "step": 270 }, { "epoch": 0.04287245444801715, "grad_norm": 10.610974311828613, "learning_rate": 4.8928188638799574e-06, "loss": 0.6402, "step": 280 }, { "epoch": 0.04287245444801715, "eval_accuracy": 0.565008347245409, "eval_loss": 0.7342826128005981, "eval_runtime": 276.5673, "eval_samples_per_second": 163.244, "eval_steps_per_second": 20.407, "step": 280 }, { "epoch": 0.044403613535446335, "grad_norm": 16.33481216430664, "learning_rate": 4.888990966161384e-06, "loss": 0.6989, "step": 290 }, { "epoch": 0.044403613535446335, "eval_accuracy": 0.5488992095744206, "eval_loss": 0.7351524829864502, "eval_runtime": 277.0818, "eval_samples_per_second": 162.941, "eval_steps_per_second": 20.369, "step": 290 }, { "epoch": 0.045934772622875514, "grad_norm": 7.8038530349731445, "learning_rate": 4.8851630684428116e-06, "loss": 0.8763, "step": 300 }, { "epoch": 0.045934772622875514, "eval_accuracy": 0.5477137400292102, "eval_loss": 0.7000990509986877, "eval_runtime": 276.7186, "eval_samples_per_second": 163.155, "eval_steps_per_second": 20.396, "step": 300 }, { "epoch": 0.0474659317103047, "grad_norm": 7.672823429107666, "learning_rate": 4.881335170724238e-06, "loss": 0.6821, "step": 310 }, { "epoch": 0.0474659317103047, "eval_accuracy": 0.5508064880495188, "eval_loss": 0.6869771480560303, "eval_runtime": 276.0483, "eval_samples_per_second": 163.551, "eval_steps_per_second": 20.446, "step": 310 }, { "epoch": 0.048997090797733886, "grad_norm": 6.58916711807251, "learning_rate": 4.877507273005666e-06, "loss": 0.6771, "step": 320 }, { "epoch": 0.048997090797733886, "eval_accuracy": 0.5899522937439237, "eval_loss": 0.6720254421234131, "eval_runtime": 276.7493, "eval_samples_per_second": 163.137, "eval_steps_per_second": 20.394, "step": 320 }, { "epoch": 0.050528249885163065, "grad_norm": 5.778668403625488, "learning_rate": 4.873679375287092e-06, "loss": 0.6747, "step": 330 }, { "epoch": 0.050528249885163065, "eval_accuracy": 0.59793675409615, "eval_loss": 0.662046492099762, "eval_runtime": 275.7574, "eval_samples_per_second": 163.724, "eval_steps_per_second": 20.467, "step": 330 }, { "epoch": 0.05205940897259225, "grad_norm": 6.639097690582275, "learning_rate": 4.86985147756852e-06, "loss": 0.6283, "step": 340 }, { "epoch": 0.05205940897259225, "eval_accuracy": 0.5966143497757848, "eval_loss": 0.6647851467132568, "eval_runtime": 274.2604, "eval_samples_per_second": 164.617, "eval_steps_per_second": 20.579, "step": 340 }, { "epoch": 0.05359056806002144, "grad_norm": 7.865772724151611, "learning_rate": 4.8660235798499465e-06, "loss": 0.6396, "step": 350 }, { "epoch": 0.05359056806002144, "eval_accuracy": 0.5950059134626113, "eval_loss": 0.6750874519348145, "eval_runtime": 275.118, "eval_samples_per_second": 164.104, "eval_steps_per_second": 20.515, "step": 350 }, { "epoch": 0.05512172714745062, "grad_norm": 12.208525657653809, "learning_rate": 4.862195682131374e-06, "loss": 0.6802, "step": 360 }, { "epoch": 0.05512172714745062, "eval_accuracy": 0.5931904836228232, "eval_loss": 0.6721886992454529, "eval_runtime": 276.9399, "eval_samples_per_second": 163.025, "eval_steps_per_second": 20.38, "step": 360 }, { "epoch": 0.0566528862348798, "grad_norm": 21.355411529541016, "learning_rate": 4.858367784412801e-06, "loss": 0.7074, "step": 370 }, { "epoch": 0.0566528862348798, "eval_accuracy": 0.5865997770345597, "eval_loss": 0.6712462306022644, "eval_runtime": 277.2497, "eval_samples_per_second": 162.842, "eval_steps_per_second": 20.357, "step": 370 }, { "epoch": 0.05818404532230899, "grad_norm": 12.31411075592041, "learning_rate": 4.854539886694228e-06, "loss": 0.7007, "step": 380 }, { "epoch": 0.05818404532230899, "eval_accuracy": 0.5599766601584416, "eval_loss": 0.6877785921096802, "eval_runtime": 278.0283, "eval_samples_per_second": 162.386, "eval_steps_per_second": 20.3, "step": 380 }, { "epoch": 0.059715204409738175, "grad_norm": 8.385506629943848, "learning_rate": 4.850711988975655e-06, "loss": 0.6589, "step": 390 }, { "epoch": 0.059715204409738175, "eval_accuracy": 0.5800094486063305, "eval_loss": 0.6738844513893127, "eval_runtime": 279.2165, "eval_samples_per_second": 161.695, "eval_steps_per_second": 20.214, "step": 390 }, { "epoch": 0.061246363497167354, "grad_norm": 9.52385139465332, "learning_rate": 4.846884091257082e-06, "loss": 0.6144, "step": 400 }, { "epoch": 0.061246363497167354, "eval_accuracy": 0.5881171772160372, "eval_loss": 0.6737349033355713, "eval_runtime": 277.7475, "eval_samples_per_second": 162.551, "eval_steps_per_second": 20.321, "step": 400 }, { "epoch": 0.06277752258459654, "grad_norm": 9.535078048706055, "learning_rate": 4.843056193538509e-06, "loss": 0.6653, "step": 410 }, { "epoch": 0.06277752258459654, "eval_accuracy": 0.5889365121885882, "eval_loss": 0.6895773410797119, "eval_runtime": 277.0672, "eval_samples_per_second": 162.95, "eval_steps_per_second": 20.371, "step": 410 }, { "epoch": 0.06430868167202572, "grad_norm": 13.276960372924805, "learning_rate": 4.839228295819936e-06, "loss": 0.6033, "step": 420 }, { "epoch": 0.06430868167202572, "eval_accuracy": 0.5839217088211637, "eval_loss": 0.7118301391601562, "eval_runtime": 277.1875, "eval_samples_per_second": 162.879, "eval_steps_per_second": 20.362, "step": 420 }, { "epoch": 0.06583984075945491, "grad_norm": 11.221186637878418, "learning_rate": 4.835400398101363e-06, "loss": 0.6102, "step": 430 }, { "epoch": 0.06583984075945491, "eval_accuracy": 0.5721997903049502, "eval_loss": 0.7291567325592041, "eval_runtime": 277.8978, "eval_samples_per_second": 162.463, "eval_steps_per_second": 20.31, "step": 430 }, { "epoch": 0.06737099984688409, "grad_norm": 12.54729175567627, "learning_rate": 4.8315725003827905e-06, "loss": 0.7269, "step": 440 }, { "epoch": 0.06737099984688409, "eval_accuracy": 0.5739306990338918, "eval_loss": 0.7139677405357361, "eval_runtime": 278.0873, "eval_samples_per_second": 162.352, "eval_steps_per_second": 20.296, "step": 440 }, { "epoch": 0.06890215893431327, "grad_norm": 7.917787075042725, "learning_rate": 4.827744602664217e-06, "loss": 0.625, "step": 450 }, { "epoch": 0.06890215893431327, "eval_accuracy": 0.5735848215281029, "eval_loss": 0.7001749873161316, "eval_runtime": 274.8459, "eval_samples_per_second": 164.267, "eval_steps_per_second": 20.535, "step": 450 }, { "epoch": 0.07043331802174246, "grad_norm": 9.970056533813477, "learning_rate": 4.823916704945645e-06, "loss": 0.6168, "step": 460 }, { "epoch": 0.07043331802174246, "eval_accuracy": 0.5706444127097465, "eval_loss": 0.7068008184432983, "eval_runtime": 275.5547, "eval_samples_per_second": 163.844, "eval_steps_per_second": 20.482, "step": 460 }, { "epoch": 0.07196447710917164, "grad_norm": 14.022720336914062, "learning_rate": 4.820088807227071e-06, "loss": 0.5978, "step": 470 }, { "epoch": 0.07196447710917164, "eval_accuracy": 0.5733915328597199, "eval_loss": 0.7220426797866821, "eval_runtime": 276.0799, "eval_samples_per_second": 163.532, "eval_steps_per_second": 20.443, "step": 470 }, { "epoch": 0.07349563619660082, "grad_norm": 15.758445739746094, "learning_rate": 4.816260909508498e-06, "loss": 0.6583, "step": 480 }, { "epoch": 0.07349563619660082, "eval_accuracy": 0.5795259437643544, "eval_loss": 0.7102298736572266, "eval_runtime": 277.2435, "eval_samples_per_second": 162.846, "eval_steps_per_second": 20.358, "step": 480 }, { "epoch": 0.07502679528403002, "grad_norm": 9.483732223510742, "learning_rate": 4.8124330117899254e-06, "loss": 0.6455, "step": 490 }, { "epoch": 0.07502679528403002, "eval_accuracy": 0.5872728491919802, "eval_loss": 0.6983802318572998, "eval_runtime": 277.7989, "eval_samples_per_second": 162.52, "eval_steps_per_second": 20.317, "step": 490 }, { "epoch": 0.0765579543714592, "grad_norm": 11.97518539428711, "learning_rate": 4.808605114071352e-06, "loss": 0.6796, "step": 500 }, { "epoch": 0.0765579543714592, "eval_accuracy": 0.5854998659876709, "eval_loss": 0.7021452188491821, "eval_runtime": 278.7441, "eval_samples_per_second": 161.969, "eval_steps_per_second": 20.248, "step": 500 }, { "epoch": 0.07808911345888837, "grad_norm": 10.056512832641602, "learning_rate": 4.8047772163527796e-06, "loss": 0.6509, "step": 510 }, { "epoch": 0.07808911345888837, "eval_accuracy": 0.57778125558934, "eval_loss": 0.7093414664268494, "eval_runtime": 286.7453, "eval_samples_per_second": 157.45, "eval_steps_per_second": 19.683, "step": 510 }, { "epoch": 0.07962027254631757, "grad_norm": 11.229554176330566, "learning_rate": 4.800949318634206e-06, "loss": 0.5777, "step": 520 }, { "epoch": 0.07962027254631757, "eval_accuracy": 0.5840340820377846, "eval_loss": 0.7045831084251404, "eval_runtime": 284.2635, "eval_samples_per_second": 158.824, "eval_steps_per_second": 19.855, "step": 520 }, { "epoch": 0.08115143163374675, "grad_norm": 16.119789123535156, "learning_rate": 4.797121420915634e-06, "loss": 0.6145, "step": 530 }, { "epoch": 0.08115143163374675, "eval_accuracy": 0.5854364178573018, "eval_loss": 0.7088597416877747, "eval_runtime": 279.337, "eval_samples_per_second": 161.626, "eval_steps_per_second": 20.205, "step": 530 }, { "epoch": 0.08268259072117592, "grad_norm": 14.363024711608887, "learning_rate": 4.79329352319706e-06, "loss": 0.6973, "step": 540 }, { "epoch": 0.08268259072117592, "eval_accuracy": 0.5904418635696169, "eval_loss": 0.6892778277397156, "eval_runtime": 279.0651, "eval_samples_per_second": 161.783, "eval_steps_per_second": 20.225, "step": 540 }, { "epoch": 0.08421374980860512, "grad_norm": 12.421643257141113, "learning_rate": 4.789465625478488e-06, "loss": 0.6444, "step": 550 }, { "epoch": 0.08421374980860512, "eval_accuracy": 0.5964966878584449, "eval_loss": 0.6805678009986877, "eval_runtime": 277.6804, "eval_samples_per_second": 162.59, "eval_steps_per_second": 20.326, "step": 550 }, { "epoch": 0.0857449088960343, "grad_norm": 13.641290664672852, "learning_rate": 4.7856377277599145e-06, "loss": 0.6197, "step": 560 }, { "epoch": 0.0857449088960343, "eval_accuracy": 0.6008450077829665, "eval_loss": 0.6836313605308533, "eval_runtime": 278.3481, "eval_samples_per_second": 162.2, "eval_steps_per_second": 20.277, "step": 560 }, { "epoch": 0.08727606798346348, "grad_norm": 11.80357837677002, "learning_rate": 4.781809830041342e-06, "loss": 0.6241, "step": 570 }, { "epoch": 0.08727606798346348, "eval_accuracy": 0.601209668453003, "eval_loss": 0.6760628819465637, "eval_runtime": 279.0499, "eval_samples_per_second": 161.792, "eval_steps_per_second": 20.226, "step": 570 }, { "epoch": 0.08880722707089267, "grad_norm": 10.733393669128418, "learning_rate": 4.777981932322769e-06, "loss": 0.713, "step": 580 }, { "epoch": 0.08880722707089267, "eval_accuracy": 0.5967536955697755, "eval_loss": 0.6692460775375366, "eval_runtime": 279.8616, "eval_samples_per_second": 161.323, "eval_steps_per_second": 20.167, "step": 580 }, { "epoch": 0.09033838615832185, "grad_norm": 11.116392135620117, "learning_rate": 4.774154034604196e-06, "loss": 0.6109, "step": 590 }, { "epoch": 0.09033838615832185, "eval_accuracy": 0.5920902946621758, "eval_loss": 0.674372673034668, "eval_runtime": 276.8783, "eval_samples_per_second": 163.061, "eval_steps_per_second": 20.384, "step": 590 }, { "epoch": 0.09186954524575103, "grad_norm": 9.64384937286377, "learning_rate": 4.770326136885623e-06, "loss": 0.6704, "step": 600 }, { "epoch": 0.09186954524575103, "eval_accuracy": 0.586066763425254, "eval_loss": 0.6863875389099121, "eval_runtime": 279.7067, "eval_samples_per_second": 161.412, "eval_steps_per_second": 20.178, "step": 600 }, { "epoch": 0.09340070433318022, "grad_norm": 13.213354110717773, "learning_rate": 4.76649823916705e-06, "loss": 0.6605, "step": 610 }, { "epoch": 0.09340070433318022, "eval_accuracy": 0.5854859919317092, "eval_loss": 0.6997817158699036, "eval_runtime": 278.2489, "eval_samples_per_second": 162.258, "eval_steps_per_second": 20.284, "step": 610 }, { "epoch": 0.0949318634206094, "grad_norm": 8.854043006896973, "learning_rate": 4.762670341448477e-06, "loss": 0.6467, "step": 620 }, { "epoch": 0.0949318634206094, "eval_accuracy": 0.5874072750022343, "eval_loss": 0.6911128759384155, "eval_runtime": 279.0402, "eval_samples_per_second": 161.797, "eval_steps_per_second": 20.226, "step": 620 }, { "epoch": 0.09646302250803858, "grad_norm": 8.668028831481934, "learning_rate": 4.758842443729904e-06, "loss": 0.653, "step": 630 }, { "epoch": 0.09646302250803858, "eval_accuracy": 0.5896617883276816, "eval_loss": 0.683178186416626, "eval_runtime": 278.9227, "eval_samples_per_second": 161.866, "eval_steps_per_second": 20.235, "step": 630 }, { "epoch": 0.09799418159546777, "grad_norm": 8.654230117797852, "learning_rate": 4.755014546011331e-06, "loss": 0.6292, "step": 640 }, { "epoch": 0.09799418159546777, "eval_accuracy": 0.5951244535641003, "eval_loss": 0.6820477843284607, "eval_runtime": 279.422, "eval_samples_per_second": 161.576, "eval_steps_per_second": 20.199, "step": 640 }, { "epoch": 0.09952534068289695, "grad_norm": 12.051166534423828, "learning_rate": 4.7511866482927585e-06, "loss": 0.6319, "step": 650 }, { "epoch": 0.09952534068289695, "eval_accuracy": 0.5965489637996976, "eval_loss": 0.6887350678443909, "eval_runtime": 279.2633, "eval_samples_per_second": 161.668, "eval_steps_per_second": 20.21, "step": 650 }, { "epoch": 0.10105649977032613, "grad_norm": 10.261021614074707, "learning_rate": 4.747358750574185e-06, "loss": 0.687, "step": 660 }, { "epoch": 0.10105649977032613, "eval_accuracy": 0.5923518675154699, "eval_loss": 0.6835098266601562, "eval_runtime": 279.8292, "eval_samples_per_second": 161.341, "eval_steps_per_second": 20.169, "step": 660 }, { "epoch": 0.10258765885775532, "grad_norm": 9.035223007202148, "learning_rate": 4.743530852855613e-06, "loss": 0.6705, "step": 670 }, { "epoch": 0.10258765885775532, "eval_accuracy": 0.5858986422906305, "eval_loss": 0.6886019706726074, "eval_runtime": 279.2751, "eval_samples_per_second": 161.661, "eval_steps_per_second": 20.209, "step": 670 }, { "epoch": 0.1041188179451845, "grad_norm": 7.328871726989746, "learning_rate": 4.739702955137039e-06, "loss": 0.565, "step": 680 }, { "epoch": 0.1041188179451845, "eval_accuracy": 0.5869302949061662, "eval_loss": 0.7063195109367371, "eval_runtime": 279.7163, "eval_samples_per_second": 161.406, "eval_steps_per_second": 20.178, "step": 680 }, { "epoch": 0.10564997703261368, "grad_norm": 21.045848846435547, "learning_rate": 4.735875057418467e-06, "loss": 0.6541, "step": 690 }, { "epoch": 0.10564997703261368, "eval_accuracy": 0.5889380826306538, "eval_loss": 0.7606213092803955, "eval_runtime": 280.23, "eval_samples_per_second": 161.111, "eval_steps_per_second": 20.141, "step": 690 }, { "epoch": 0.10718113612004287, "grad_norm": 12.267477989196777, "learning_rate": 4.732047159699893e-06, "loss": 0.7604, "step": 700 }, { "epoch": 0.10718113612004287, "eval_accuracy": 0.5878412959789937, "eval_loss": 0.7354863882064819, "eval_runtime": 280.1519, "eval_samples_per_second": 161.155, "eval_steps_per_second": 20.146, "step": 700 }, { "epoch": 0.10871229520747205, "grad_norm": 8.619697570800781, "learning_rate": 4.72821926198132e-06, "loss": 0.6401, "step": 710 }, { "epoch": 0.10871229520747205, "eval_accuracy": 0.5879179670084708, "eval_loss": 0.699480414390564, "eval_runtime": 282.5216, "eval_samples_per_second": 159.804, "eval_steps_per_second": 19.977, "step": 710 }, { "epoch": 0.11024345429490125, "grad_norm": 6.322849750518799, "learning_rate": 4.724391364262747e-06, "loss": 0.6129, "step": 720 }, { "epoch": 0.11024345429490125, "eval_accuracy": 0.5929760364139408, "eval_loss": 0.688232958316803, "eval_runtime": 278.9404, "eval_samples_per_second": 161.855, "eval_steps_per_second": 20.234, "step": 720 }, { "epoch": 0.11177461338233043, "grad_norm": 9.213967323303223, "learning_rate": 4.720563466544174e-06, "loss": 0.6502, "step": 730 }, { "epoch": 0.11177461338233043, "eval_accuracy": 0.5926983206583555, "eval_loss": 0.6913579702377319, "eval_runtime": 277.6463, "eval_samples_per_second": 162.61, "eval_steps_per_second": 20.328, "step": 730 }, { "epoch": 0.1133057724697596, "grad_norm": 7.4615349769592285, "learning_rate": 4.716735568825601e-06, "loss": 0.6199, "step": 740 }, { "epoch": 0.1133057724697596, "eval_accuracy": 0.5917493589028877, "eval_loss": 0.6992406845092773, "eval_runtime": 278.7175, "eval_samples_per_second": 161.985, "eval_steps_per_second": 20.25, "step": 740 }, { "epoch": 0.1148369315571888, "grad_norm": 11.835037231445312, "learning_rate": 4.712907671107028e-06, "loss": 0.5761, "step": 750 }, { "epoch": 0.1148369315571888, "eval_accuracy": 0.5893469260561813, "eval_loss": 0.7284606099128723, "eval_runtime": 279.7689, "eval_samples_per_second": 161.376, "eval_steps_per_second": 20.174, "step": 750 }, { "epoch": 0.11636809064461798, "grad_norm": 9.900166511535645, "learning_rate": 4.709079773388455e-06, "loss": 0.6017, "step": 760 }, { "epoch": 0.11636809064461798, "eval_accuracy": 0.5889316629208483, "eval_loss": 0.7434907555580139, "eval_runtime": 281.7435, "eval_samples_per_second": 160.245, "eval_steps_per_second": 20.032, "step": 760 }, { "epoch": 0.11789924973204716, "grad_norm": 12.274435997009277, "learning_rate": 4.7052518756698825e-06, "loss": 0.5757, "step": 770 }, { "epoch": 0.11789924973204716, "eval_accuracy": 0.5897960545337277, "eval_loss": 0.7581047415733337, "eval_runtime": 279.4034, "eval_samples_per_second": 161.587, "eval_steps_per_second": 20.2, "step": 770 }, { "epoch": 0.11943040881947635, "grad_norm": 10.7369384765625, "learning_rate": 4.701423977951309e-06, "loss": 0.6231, "step": 780 }, { "epoch": 0.11943040881947635, "eval_accuracy": 0.5952973044984236, "eval_loss": 0.7496009469032288, "eval_runtime": 279.5031, "eval_samples_per_second": 161.529, "eval_steps_per_second": 20.193, "step": 780 }, { "epoch": 0.12096156790690553, "grad_norm": 11.4940824508667, "learning_rate": 4.697596080232737e-06, "loss": 0.6995, "step": 790 }, { "epoch": 0.12096156790690553, "eval_accuracy": 0.5959564541213064, "eval_loss": 0.7335057258605957, "eval_runtime": 279.459, "eval_samples_per_second": 161.555, "eval_steps_per_second": 20.196, "step": 790 }, { "epoch": 0.12249272699433471, "grad_norm": 8.03961181640625, "learning_rate": 4.693768182514163e-06, "loss": 0.6434, "step": 800 }, { "epoch": 0.12249272699433471, "eval_accuracy": 0.5859463796215819, "eval_loss": 0.728286623954773, "eval_runtime": 280.7002, "eval_samples_per_second": 160.841, "eval_steps_per_second": 20.107, "step": 800 }, { "epoch": 0.1240238860817639, "grad_norm": 8.879143714904785, "learning_rate": 4.689940284795591e-06, "loss": 0.7005, "step": 810 }, { "epoch": 0.1240238860817639, "eval_accuracy": 0.5771607003457121, "eval_loss": 0.7147245407104492, "eval_runtime": 276.8558, "eval_samples_per_second": 163.074, "eval_steps_per_second": 20.386, "step": 810 }, { "epoch": 0.12555504516919308, "grad_norm": 7.983788967132568, "learning_rate": 4.686112387077017e-06, "loss": 0.6639, "step": 820 }, { "epoch": 0.12555504516919308, "eval_accuracy": 0.5777222309014216, "eval_loss": 0.6992844939231873, "eval_runtime": 278.3704, "eval_samples_per_second": 162.187, "eval_steps_per_second": 20.275, "step": 820 }, { "epoch": 0.12708620425662226, "grad_norm": 6.068845748901367, "learning_rate": 4.682284489358445e-06, "loss": 0.6211, "step": 830 }, { "epoch": 0.12708620425662226, "eval_accuracy": 0.5828836462560764, "eval_loss": 0.689354419708252, "eval_runtime": 278.7274, "eval_samples_per_second": 161.979, "eval_steps_per_second": 20.249, "step": 830 }, { "epoch": 0.12861736334405144, "grad_norm": 7.345738887786865, "learning_rate": 4.6784565916398715e-06, "loss": 0.6456, "step": 840 }, { "epoch": 0.12861736334405144, "eval_accuracy": 0.5853175045103236, "eval_loss": 0.6859722137451172, "eval_runtime": 279.9849, "eval_samples_per_second": 161.252, "eval_steps_per_second": 20.158, "step": 840 }, { "epoch": 0.13014852243148062, "grad_norm": 11.570878028869629, "learning_rate": 4.674628693921299e-06, "loss": 0.6255, "step": 850 }, { "epoch": 0.13014852243148062, "eval_accuracy": 0.5951967978652435, "eval_loss": 0.6828535199165344, "eval_runtime": 281.2947, "eval_samples_per_second": 160.501, "eval_steps_per_second": 20.064, "step": 850 }, { "epoch": 0.13167968151890982, "grad_norm": 10.029556274414062, "learning_rate": 4.670800796202726e-06, "loss": 0.5931, "step": 860 }, { "epoch": 0.13167968151890982, "eval_accuracy": 0.5980770938804512, "eval_loss": 0.6795242428779602, "eval_runtime": 280.5479, "eval_samples_per_second": 160.928, "eval_steps_per_second": 20.118, "step": 860 }, { "epoch": 0.133210840606339, "grad_norm": 16.333703994750977, "learning_rate": 4.666972898484153e-06, "loss": 0.7352, "step": 870 }, { "epoch": 0.133210840606339, "eval_accuracy": 0.5991164979577339, "eval_loss": 0.6769992709159851, "eval_runtime": 280.0752, "eval_samples_per_second": 161.2, "eval_steps_per_second": 20.152, "step": 870 }, { "epoch": 0.13474199969376818, "grad_norm": 6.884426593780518, "learning_rate": 4.66314500076558e-06, "loss": 0.6425, "step": 880 }, { "epoch": 0.13474199969376818, "eval_accuracy": 0.598591236334548, "eval_loss": 0.6669920086860657, "eval_runtime": 279.9453, "eval_samples_per_second": 161.274, "eval_steps_per_second": 20.161, "step": 880 }, { "epoch": 0.13627315878119736, "grad_norm": 9.960312843322754, "learning_rate": 4.659317103047007e-06, "loss": 0.6905, "step": 890 }, { "epoch": 0.13627315878119736, "eval_accuracy": 0.5981738203145828, "eval_loss": 0.6663030385971069, "eval_runtime": 279.6741, "eval_samples_per_second": 161.431, "eval_steps_per_second": 20.181, "step": 890 }, { "epoch": 0.13780431786862654, "grad_norm": 7.798278331756592, "learning_rate": 4.655489205328434e-06, "loss": 0.5681, "step": 900 }, { "epoch": 0.13780431786862654, "eval_accuracy": 0.6002266515565629, "eval_loss": 0.6640587449073792, "eval_runtime": 279.9048, "eval_samples_per_second": 161.298, "eval_steps_per_second": 20.164, "step": 900 }, { "epoch": 0.13933547695605572, "grad_norm": 10.989811897277832, "learning_rate": 4.651661307609861e-06, "loss": 0.5809, "step": 910 }, { "epoch": 0.13933547695605572, "eval_accuracy": 0.599174562318326, "eval_loss": 0.6831759810447693, "eval_runtime": 278.8518, "eval_samples_per_second": 161.907, "eval_steps_per_second": 20.24, "step": 910 }, { "epoch": 0.14086663604348493, "grad_norm": 13.403684616088867, "learning_rate": 4.647833409891288e-06, "loss": 0.5984, "step": 920 }, { "epoch": 0.14086663604348493, "eval_accuracy": 0.5973414996782282, "eval_loss": 0.7047386765480042, "eval_runtime": 277.7644, "eval_samples_per_second": 162.541, "eval_steps_per_second": 20.319, "step": 920 }, { "epoch": 0.1423977951309141, "grad_norm": 11.702314376831055, "learning_rate": 4.6440055121727155e-06, "loss": 0.631, "step": 930 }, { "epoch": 0.1423977951309141, "eval_accuracy": 0.5949097681018046, "eval_loss": 0.7055184841156006, "eval_runtime": 277.5329, "eval_samples_per_second": 162.676, "eval_steps_per_second": 20.336, "step": 930 }, { "epoch": 0.14392895421834329, "grad_norm": 10.92357063293457, "learning_rate": 4.640177614454142e-06, "loss": 0.6703, "step": 940 }, { "epoch": 0.14392895421834329, "eval_accuracy": 0.5971907868459593, "eval_loss": 0.684637188911438, "eval_runtime": 276.7646, "eval_samples_per_second": 163.128, "eval_steps_per_second": 20.393, "step": 940 }, { "epoch": 0.14546011330577246, "grad_norm": 9.321954727172852, "learning_rate": 4.636349716735569e-06, "loss": 0.6304, "step": 950 }, { "epoch": 0.14546011330577246, "eval_accuracy": 0.5969360568383659, "eval_loss": 0.6831667423248291, "eval_runtime": 276.3791, "eval_samples_per_second": 163.355, "eval_steps_per_second": 20.421, "step": 950 }, { "epoch": 0.14699127239320164, "grad_norm": 11.221244812011719, "learning_rate": 4.632521819016996e-06, "loss": 0.6373, "step": 960 }, { "epoch": 0.14699127239320164, "eval_accuracy": 0.5956472445145944, "eval_loss": 0.6867417097091675, "eval_runtime": 275.868, "eval_samples_per_second": 163.658, "eval_steps_per_second": 20.459, "step": 960 }, { "epoch": 0.14852243148063085, "grad_norm": 13.402386665344238, "learning_rate": 4.628693921298423e-06, "loss": 0.6338, "step": 970 }, { "epoch": 0.14852243148063085, "eval_accuracy": 0.595343976519767, "eval_loss": 0.6871860027313232, "eval_runtime": 276.6694, "eval_samples_per_second": 163.184, "eval_steps_per_second": 20.4, "step": 970 }, { "epoch": 0.15005359056806003, "grad_norm": 6.8687520027160645, "learning_rate": 4.6248660235798504e-06, "loss": 0.6541, "step": 980 }, { "epoch": 0.15005359056806003, "eval_accuracy": 0.5944526067405725, "eval_loss": 0.6828967332839966, "eval_runtime": 277.3111, "eval_samples_per_second": 162.806, "eval_steps_per_second": 20.353, "step": 980 }, { "epoch": 0.1515847496554892, "grad_norm": 8.383277893066406, "learning_rate": 4.621038125861277e-06, "loss": 0.6485, "step": 990 }, { "epoch": 0.1515847496554892, "eval_accuracy": 0.5881514159035716, "eval_loss": 0.6898565292358398, "eval_runtime": 278.3748, "eval_samples_per_second": 162.184, "eval_steps_per_second": 20.275, "step": 990 }, { "epoch": 0.1531159087429184, "grad_norm": 8.281054496765137, "learning_rate": 4.617210228142705e-06, "loss": 0.5877, "step": 1000 }, { "epoch": 0.1531159087429184, "eval_accuracy": 0.5914452307829261, "eval_loss": 0.6997293829917908, "eval_runtime": 277.7962, "eval_samples_per_second": 162.522, "eval_steps_per_second": 20.317, "step": 1000 }, { "epoch": 0.15464706783034757, "grad_norm": 10.8377685546875, "learning_rate": 4.613382330424131e-06, "loss": 0.6585, "step": 1010 }, { "epoch": 0.15464706783034757, "eval_accuracy": 0.5923300819872465, "eval_loss": 0.6947582364082336, "eval_runtime": 278.9872, "eval_samples_per_second": 161.828, "eval_steps_per_second": 20.23, "step": 1010 }, { "epoch": 0.15617822691777675, "grad_norm": 12.618541717529297, "learning_rate": 4.609554432705559e-06, "loss": 0.6153, "step": 1020 }, { "epoch": 0.15617822691777675, "eval_accuracy": 0.5965337184757249, "eval_loss": 0.6904256939888, "eval_runtime": 278.6853, "eval_samples_per_second": 162.004, "eval_steps_per_second": 20.252, "step": 1020 }, { "epoch": 0.15770938600520595, "grad_norm": 15.610793113708496, "learning_rate": 4.605726534986985e-06, "loss": 0.6145, "step": 1030 }, { "epoch": 0.15770938600520595, "eval_accuracy": 0.5957805907172996, "eval_loss": 0.7072130441665649, "eval_runtime": 278.4706, "eval_samples_per_second": 162.128, "eval_steps_per_second": 20.268, "step": 1030 }, { "epoch": 0.15924054509263513, "grad_norm": 10.127962112426758, "learning_rate": 4.601898637268413e-06, "loss": 0.6019, "step": 1040 }, { "epoch": 0.15924054509263513, "eval_accuracy": 0.5954627183733269, "eval_loss": 0.6940288543701172, "eval_runtime": 278.8001, "eval_samples_per_second": 161.937, "eval_steps_per_second": 20.244, "step": 1040 }, { "epoch": 0.1607717041800643, "grad_norm": 18.335458755493164, "learning_rate": 4.5980707395498395e-06, "loss": 0.5354, "step": 1050 }, { "epoch": 0.1607717041800643, "eval_accuracy": 0.5993520757982559, "eval_loss": 0.7147676348686218, "eval_runtime": 278.6112, "eval_samples_per_second": 162.047, "eval_steps_per_second": 20.258, "step": 1050 }, { "epoch": 0.1623028632674935, "grad_norm": 13.370587348937988, "learning_rate": 4.594242841831267e-06, "loss": 0.6977, "step": 1060 }, { "epoch": 0.1623028632674935, "eval_accuracy": 0.5989220600629908, "eval_loss": 0.7047263979911804, "eval_runtime": 279.4512, "eval_samples_per_second": 161.56, "eval_steps_per_second": 20.197, "step": 1060 }, { "epoch": 0.16383402235492267, "grad_norm": 9.09716510772705, "learning_rate": 4.590414944112694e-06, "loss": 0.6039, "step": 1070 }, { "epoch": 0.16383402235492267, "eval_accuracy": 0.5984739258700619, "eval_loss": 0.6938444972038269, "eval_runtime": 280.6964, "eval_samples_per_second": 160.843, "eval_steps_per_second": 20.107, "step": 1070 }, { "epoch": 0.16536518144235185, "grad_norm": 11.401485443115234, "learning_rate": 4.586587046394121e-06, "loss": 0.6579, "step": 1080 }, { "epoch": 0.16536518144235185, "eval_accuracy": 0.5967512870584059, "eval_loss": 0.6896911263465881, "eval_runtime": 279.247, "eval_samples_per_second": 161.678, "eval_steps_per_second": 20.212, "step": 1080 }, { "epoch": 0.16689634052978106, "grad_norm": 10.442956924438477, "learning_rate": 4.582759148675548e-06, "loss": 0.5409, "step": 1090 }, { "epoch": 0.16689634052978106, "eval_accuracy": 0.5923961292613636, "eval_loss": 0.7205661535263062, "eval_runtime": 278.5362, "eval_samples_per_second": 162.09, "eval_steps_per_second": 20.263, "step": 1090 }, { "epoch": 0.16842749961721024, "grad_norm": 24.116500854492188, "learning_rate": 4.578931250956975e-06, "loss": 0.5717, "step": 1100 }, { "epoch": 0.16842749961721024, "eval_accuracy": 0.5918675367336973, "eval_loss": 0.7739020586013794, "eval_runtime": 277.7642, "eval_samples_per_second": 162.541, "eval_steps_per_second": 20.319, "step": 1100 }, { "epoch": 0.16995865870463941, "grad_norm": 17.19237518310547, "learning_rate": 4.575103353238402e-06, "loss": 0.7444, "step": 1110 }, { "epoch": 0.16995865870463941, "eval_accuracy": 0.5971056953877569, "eval_loss": 0.7259252667427063, "eval_runtime": 280.0491, "eval_samples_per_second": 161.215, "eval_steps_per_second": 20.154, "step": 1110 }, { "epoch": 0.1714898177920686, "grad_norm": 12.191926002502441, "learning_rate": 4.571275455519829e-06, "loss": 0.5495, "step": 1120 }, { "epoch": 0.1714898177920686, "eval_accuracy": 0.5972413486739816, "eval_loss": 0.7175703644752502, "eval_runtime": 280.1557, "eval_samples_per_second": 161.153, "eval_steps_per_second": 20.146, "step": 1120 }, { "epoch": 0.17302097687949777, "grad_norm": 18.153154373168945, "learning_rate": 4.567447557801256e-06, "loss": 0.6002, "step": 1130 }, { "epoch": 0.17302097687949777, "eval_accuracy": 0.5982025962498613, "eval_loss": 0.7397978901863098, "eval_runtime": 280.0206, "eval_samples_per_second": 161.231, "eval_steps_per_second": 20.156, "step": 1130 }, { "epoch": 0.17455213596692695, "grad_norm": 9.707260131835938, "learning_rate": 4.5636196600826835e-06, "loss": 0.648, "step": 1140 }, { "epoch": 0.17455213596692695, "eval_accuracy": 0.5982077501497238, "eval_loss": 0.7219535708427429, "eval_runtime": 279.1283, "eval_samples_per_second": 161.746, "eval_steps_per_second": 20.22, "step": 1140 }, { "epoch": 0.17608329505435616, "grad_norm": 13.713787078857422, "learning_rate": 4.55979176236411e-06, "loss": 0.7169, "step": 1150 }, { "epoch": 0.17608329505435616, "eval_accuracy": 0.5967283703999645, "eval_loss": 0.7080119848251343, "eval_runtime": 280.0417, "eval_samples_per_second": 161.219, "eval_steps_per_second": 20.154, "step": 1150 }, { "epoch": 0.17761445414178534, "grad_norm": 12.010796546936035, "learning_rate": 4.555963864645538e-06, "loss": 0.6007, "step": 1160 }, { "epoch": 0.17761445414178534, "eval_accuracy": 0.593027131524565, "eval_loss": 0.686759889125824, "eval_runtime": 277.4036, "eval_samples_per_second": 162.752, "eval_steps_per_second": 20.346, "step": 1160 }, { "epoch": 0.17914561322921452, "grad_norm": 11.684185028076172, "learning_rate": 4.552135966926964e-06, "loss": 0.5699, "step": 1170 }, { "epoch": 0.17914561322921452, "eval_accuracy": 0.589081225033289, "eval_loss": 0.6952749490737915, "eval_runtime": 278.728, "eval_samples_per_second": 161.979, "eval_steps_per_second": 20.249, "step": 1170 }, { "epoch": 0.1806767723166437, "grad_norm": 14.61754035949707, "learning_rate": 4.548308069208391e-06, "loss": 0.6718, "step": 1180 }, { "epoch": 0.1806767723166437, "eval_accuracy": 0.5850660157550205, "eval_loss": 0.7031010985374451, "eval_runtime": 279.3758, "eval_samples_per_second": 161.603, "eval_steps_per_second": 20.202, "step": 1180 }, { "epoch": 0.18220793140407288, "grad_norm": 8.807073593139648, "learning_rate": 4.544480171489818e-06, "loss": 0.6719, "step": 1190 }, { "epoch": 0.18220793140407288, "eval_accuracy": 0.5842983840494343, "eval_loss": 0.6897585988044739, "eval_runtime": 278.4428, "eval_samples_per_second": 162.145, "eval_steps_per_second": 20.27, "step": 1190 }, { "epoch": 0.18373909049150206, "grad_norm": 8.141523361206055, "learning_rate": 4.540652273771245e-06, "loss": 0.6139, "step": 1200 }, { "epoch": 0.18373909049150206, "eval_accuracy": 0.5901683023224832, "eval_loss": 0.6856178045272827, "eval_runtime": 278.2363, "eval_samples_per_second": 162.265, "eval_steps_per_second": 20.285, "step": 1200 }, { "epoch": 0.18527024957893126, "grad_norm": 8.22572135925293, "learning_rate": 4.536824376052672e-06, "loss": 0.6554, "step": 1210 }, { "epoch": 0.18527024957893126, "eval_accuracy": 0.5936903334665423, "eval_loss": 0.6899842619895935, "eval_runtime": 279.6454, "eval_samples_per_second": 161.447, "eval_steps_per_second": 20.183, "step": 1210 }, { "epoch": 0.18680140866636044, "grad_norm": 10.63383674621582, "learning_rate": 4.532996478334099e-06, "loss": 0.5281, "step": 1220 }, { "epoch": 0.18680140866636044, "eval_accuracy": 0.5959264271926515, "eval_loss": 0.7073134183883667, "eval_runtime": 280.0671, "eval_samples_per_second": 161.204, "eval_steps_per_second": 20.152, "step": 1220 }, { "epoch": 0.18833256775378962, "grad_norm": 17.710468292236328, "learning_rate": 4.529168580615526e-06, "loss": 0.6106, "step": 1230 }, { "epoch": 0.18833256775378962, "eval_accuracy": 0.5961982077899033, "eval_loss": 0.7480549812316895, "eval_runtime": 279.3117, "eval_samples_per_second": 161.64, "eval_steps_per_second": 20.207, "step": 1230 }, { "epoch": 0.1898637268412188, "grad_norm": 19.713132858276367, "learning_rate": 4.525340682896953e-06, "loss": 0.6344, "step": 1240 }, { "epoch": 0.1898637268412188, "eval_accuracy": 0.5923722417084758, "eval_loss": 0.7380005717277527, "eval_runtime": 279.4642, "eval_samples_per_second": 161.552, "eval_steps_per_second": 20.196, "step": 1240 }, { "epoch": 0.19139488592864798, "grad_norm": 17.92173957824707, "learning_rate": 4.52151278517838e-06, "loss": 0.5918, "step": 1250 }, { "epoch": 0.19139488592864798, "eval_accuracy": 0.5899982226961699, "eval_loss": 0.7242019772529602, "eval_runtime": 279.4636, "eval_samples_per_second": 161.552, "eval_steps_per_second": 20.196, "step": 1250 }, { "epoch": 0.19292604501607716, "grad_norm": 12.8328857421875, "learning_rate": 4.5176848874598075e-06, "loss": 0.6847, "step": 1260 }, { "epoch": 0.19292604501607716, "eval_accuracy": 0.5958907152376721, "eval_loss": 0.6952394843101501, "eval_runtime": 276.2146, "eval_samples_per_second": 163.453, "eval_steps_per_second": 20.433, "step": 1260 }, { "epoch": 0.19445720410350636, "grad_norm": 8.0042142868042, "learning_rate": 4.513856989741234e-06, "loss": 0.6312, "step": 1270 }, { "epoch": 0.19445720410350636, "eval_accuracy": 0.5953926887841323, "eval_loss": 0.6836423873901367, "eval_runtime": 279.4491, "eval_samples_per_second": 161.561, "eval_steps_per_second": 20.197, "step": 1270 }, { "epoch": 0.19598836319093554, "grad_norm": 10.186333656311035, "learning_rate": 4.510029092022662e-06, "loss": 0.6135, "step": 1280 }, { "epoch": 0.19598836319093554, "eval_accuracy": 0.5948660962329148, "eval_loss": 0.6878789067268372, "eval_runtime": 276.736, "eval_samples_per_second": 163.145, "eval_steps_per_second": 20.395, "step": 1280 }, { "epoch": 0.19751952227836472, "grad_norm": 9.086492538452148, "learning_rate": 4.506201194304088e-06, "loss": 0.6481, "step": 1290 }, { "epoch": 0.19751952227836472, "eval_accuracy": 0.5944503735325507, "eval_loss": 0.6821103692054749, "eval_runtime": 274.4173, "eval_samples_per_second": 164.523, "eval_steps_per_second": 20.567, "step": 1290 }, { "epoch": 0.1990506813657939, "grad_norm": 8.008011817932129, "learning_rate": 4.502373296585516e-06, "loss": 0.6022, "step": 1300 }, { "epoch": 0.1990506813657939, "eval_accuracy": 0.5919674427913804, "eval_loss": 0.6874357461929321, "eval_runtime": 273.5543, "eval_samples_per_second": 165.042, "eval_steps_per_second": 20.632, "step": 1300 }, { "epoch": 0.20058184045322308, "grad_norm": 9.115822792053223, "learning_rate": 4.498545398866942e-06, "loss": 0.5877, "step": 1310 }, { "epoch": 0.20058184045322308, "eval_accuracy": 0.5949752993012595, "eval_loss": 0.696998655796051, "eval_runtime": 277.1667, "eval_samples_per_second": 162.891, "eval_steps_per_second": 20.363, "step": 1310 }, { "epoch": 0.20211299954065226, "grad_norm": 14.700295448303223, "learning_rate": 4.49471750114837e-06, "loss": 0.6563, "step": 1320 }, { "epoch": 0.20211299954065226, "eval_accuracy": 0.5916631504141775, "eval_loss": 0.7209578156471252, "eval_runtime": 277.3197, "eval_samples_per_second": 162.801, "eval_steps_per_second": 20.352, "step": 1320 }, { "epoch": 0.20364415862808147, "grad_norm": 12.265641212463379, "learning_rate": 4.4908896034297965e-06, "loss": 0.6844, "step": 1330 }, { "epoch": 0.20364415862808147, "eval_accuracy": 0.5921450151057401, "eval_loss": 0.7018990516662598, "eval_runtime": 275.9386, "eval_samples_per_second": 163.616, "eval_steps_per_second": 20.454, "step": 1330 }, { "epoch": 0.20517531771551065, "grad_norm": 14.695290565490723, "learning_rate": 4.487061705711224e-06, "loss": 0.6242, "step": 1340 }, { "epoch": 0.20517531771551065, "eval_accuracy": 0.5874938869870626, "eval_loss": 0.7059697508811951, "eval_runtime": 275.3787, "eval_samples_per_second": 163.949, "eval_steps_per_second": 20.495, "step": 1340 }, { "epoch": 0.20670647680293983, "grad_norm": 17.197837829589844, "learning_rate": 4.483233807992651e-06, "loss": 0.6532, "step": 1350 }, { "epoch": 0.20670647680293983, "eval_accuracy": 0.5832462130480237, "eval_loss": 0.7054564952850342, "eval_runtime": 278.9798, "eval_samples_per_second": 161.833, "eval_steps_per_second": 20.231, "step": 1350 }, { "epoch": 0.208237635890369, "grad_norm": 10.455153465270996, "learning_rate": 4.479405910274078e-06, "loss": 0.6235, "step": 1360 }, { "epoch": 0.208237635890369, "eval_accuracy": 0.5825216811207472, "eval_loss": 0.7006902098655701, "eval_runtime": 278.2863, "eval_samples_per_second": 162.236, "eval_steps_per_second": 20.281, "step": 1360 }, { "epoch": 0.20976879497779818, "grad_norm": 11.930909156799316, "learning_rate": 4.475578012555505e-06, "loss": 0.5851, "step": 1370 }, { "epoch": 0.20976879497779818, "eval_accuracy": 0.5853447126283504, "eval_loss": 0.7129948139190674, "eval_runtime": 277.2023, "eval_samples_per_second": 162.87, "eval_steps_per_second": 20.361, "step": 1370 }, { "epoch": 0.21129995406522736, "grad_norm": 10.416621208190918, "learning_rate": 4.471750114836932e-06, "loss": 0.6387, "step": 1380 }, { "epoch": 0.21129995406522736, "eval_accuracy": 0.5875080603917906, "eval_loss": 0.7203475832939148, "eval_runtime": 276.7485, "eval_samples_per_second": 163.137, "eval_steps_per_second": 20.394, "step": 1380 }, { "epoch": 0.21283111315265657, "grad_norm": 14.316187858581543, "learning_rate": 4.467922217118359e-06, "loss": 0.5589, "step": 1390 }, { "epoch": 0.21283111315265657, "eval_accuracy": 0.5918902114052229, "eval_loss": 0.7276438474655151, "eval_runtime": 277.5697, "eval_samples_per_second": 162.655, "eval_steps_per_second": 20.334, "step": 1390 }, { "epoch": 0.21436227224008575, "grad_norm": 11.353260040283203, "learning_rate": 4.4640943193997856e-06, "loss": 0.5305, "step": 1400 }, { "epoch": 0.21436227224008575, "eval_accuracy": 0.5941167335891918, "eval_loss": 0.7376775145530701, "eval_runtime": 277.7475, "eval_samples_per_second": 162.551, "eval_steps_per_second": 20.321, "step": 1400 }, { "epoch": 0.21589343132751493, "grad_norm": 25.445398330688477, "learning_rate": 4.460266421681213e-06, "loss": 0.6585, "step": 1410 }, { "epoch": 0.21589343132751493, "eval_accuracy": 0.5962537174308669, "eval_loss": 0.7421597242355347, "eval_runtime": 276.6197, "eval_samples_per_second": 163.213, "eval_steps_per_second": 20.403, "step": 1410 }, { "epoch": 0.2174245904149441, "grad_norm": 12.295394897460938, "learning_rate": 4.45643852396264e-06, "loss": 0.6483, "step": 1420 }, { "epoch": 0.2174245904149441, "eval_accuracy": 0.5987611837577426, "eval_loss": 0.6953349709510803, "eval_runtime": 278.2037, "eval_samples_per_second": 162.284, "eval_steps_per_second": 20.287, "step": 1420 }, { "epoch": 0.2189557495023733, "grad_norm": 11.241786003112793, "learning_rate": 4.452610626244067e-06, "loss": 0.5395, "step": 1430 }, { "epoch": 0.2189557495023733, "eval_accuracy": 0.5976324790121263, "eval_loss": 0.6999543309211731, "eval_runtime": 279.3688, "eval_samples_per_second": 161.607, "eval_steps_per_second": 20.203, "step": 1430 }, { "epoch": 0.2204869085898025, "grad_norm": 14.92603874206543, "learning_rate": 4.448782728525494e-06, "loss": 0.619, "step": 1440 }, { "epoch": 0.2204869085898025, "eval_accuracy": 0.5938706670809107, "eval_loss": 0.7095398306846619, "eval_runtime": 280.0278, "eval_samples_per_second": 161.227, "eval_steps_per_second": 20.155, "step": 1440 }, { "epoch": 0.22201806767723167, "grad_norm": 20.692684173583984, "learning_rate": 4.444954830806921e-06, "loss": 0.4735, "step": 1450 }, { "epoch": 0.22201806767723167, "eval_accuracy": 0.5908092395766303, "eval_loss": 0.734937310218811, "eval_runtime": 279.9051, "eval_samples_per_second": 161.298, "eval_steps_per_second": 20.164, "step": 1450 }, { "epoch": 0.22354922676466085, "grad_norm": 17.677717208862305, "learning_rate": 4.441126933088348e-06, "loss": 0.6086, "step": 1460 }, { "epoch": 0.22354922676466085, "eval_accuracy": 0.595049395049395, "eval_loss": 0.7369093894958496, "eval_runtime": 280.5701, "eval_samples_per_second": 160.915, "eval_steps_per_second": 20.116, "step": 1460 }, { "epoch": 0.22508038585209003, "grad_norm": 15.074790954589844, "learning_rate": 4.4372990353697755e-06, "loss": 0.5995, "step": 1470 }, { "epoch": 0.22508038585209003, "eval_accuracy": 0.5964959030044634, "eval_loss": 0.71633380651474, "eval_runtime": 280.9412, "eval_samples_per_second": 160.703, "eval_steps_per_second": 20.09, "step": 1470 }, { "epoch": 0.2266115449395192, "grad_norm": 14.004373550415039, "learning_rate": 4.433471137651202e-06, "loss": 0.6036, "step": 1480 }, { "epoch": 0.2266115449395192, "eval_accuracy": 0.5984582574310214, "eval_loss": 0.7075589895248413, "eval_runtime": 279.8252, "eval_samples_per_second": 161.344, "eval_steps_per_second": 20.17, "step": 1480 }, { "epoch": 0.2281427040269484, "grad_norm": 12.327754974365234, "learning_rate": 4.42964323993263e-06, "loss": 0.6168, "step": 1490 }, { "epoch": 0.2281427040269484, "eval_accuracy": 0.5992044974779459, "eval_loss": 0.692619800567627, "eval_runtime": 280.1157, "eval_samples_per_second": 161.176, "eval_steps_per_second": 20.149, "step": 1490 }, { "epoch": 0.2296738631143776, "grad_norm": 10.927477836608887, "learning_rate": 4.425815342214056e-06, "loss": 0.5584, "step": 1500 }, { "epoch": 0.2296738631143776, "eval_accuracy": 0.5985860696738623, "eval_loss": 0.7029620409011841, "eval_runtime": 278.9125, "eval_samples_per_second": 161.872, "eval_steps_per_second": 20.236, "step": 1500 }, { "epoch": 0.23120502220180678, "grad_norm": 21.215038299560547, "learning_rate": 4.421987444495484e-06, "loss": 0.6836, "step": 1510 }, { "epoch": 0.23120502220180678, "eval_accuracy": 0.5978149842341343, "eval_loss": 0.7012072205543518, "eval_runtime": 279.942, "eval_samples_per_second": 161.276, "eval_steps_per_second": 20.161, "step": 1510 }, { "epoch": 0.23273618128923595, "grad_norm": 18.26300621032715, "learning_rate": 4.41815954677691e-06, "loss": 0.5803, "step": 1520 }, { "epoch": 0.23273618128923595, "eval_accuracy": 0.593573744282098, "eval_loss": 0.7073465585708618, "eval_runtime": 278.8484, "eval_samples_per_second": 161.909, "eval_steps_per_second": 20.24, "step": 1520 }, { "epoch": 0.23426734037666513, "grad_norm": 15.730330467224121, "learning_rate": 4.414331649058338e-06, "loss": 0.6735, "step": 1530 }, { "epoch": 0.23426734037666513, "eval_accuracy": 0.5931864173097022, "eval_loss": 0.694299578666687, "eval_runtime": 278.339, "eval_samples_per_second": 162.205, "eval_steps_per_second": 20.277, "step": 1530 }, { "epoch": 0.2357984994640943, "grad_norm": 10.599174499511719, "learning_rate": 4.4105037513397645e-06, "loss": 0.6482, "step": 1540 }, { "epoch": 0.2357984994640943, "eval_accuracy": 0.5938021401081177, "eval_loss": 0.6790253520011902, "eval_runtime": 279.1453, "eval_samples_per_second": 161.737, "eval_steps_per_second": 20.219, "step": 1540 }, { "epoch": 0.2373296585515235, "grad_norm": 9.95355224609375, "learning_rate": 4.406675853621192e-06, "loss": 0.6667, "step": 1550 }, { "epoch": 0.2373296585515235, "eval_accuracy": 0.5938640206460799, "eval_loss": 0.6704154014587402, "eval_runtime": 279.4535, "eval_samples_per_second": 161.558, "eval_steps_per_second": 20.197, "step": 1550 }, { "epoch": 0.2388608176389527, "grad_norm": 9.302884101867676, "learning_rate": 4.402847955902619e-06, "loss": 0.604, "step": 1560 }, { "epoch": 0.2388608176389527, "eval_accuracy": 0.5950828790744243, "eval_loss": 0.6687915921211243, "eval_runtime": 277.3792, "eval_samples_per_second": 162.766, "eval_steps_per_second": 20.348, "step": 1560 }, { "epoch": 0.24039197672638188, "grad_norm": 8.783987998962402, "learning_rate": 4.399020058184046e-06, "loss": 0.5914, "step": 1570 }, { "epoch": 0.24039197672638188, "eval_accuracy": 0.5949302294527408, "eval_loss": 0.6737338304519653, "eval_runtime": 277.8845, "eval_samples_per_second": 162.47, "eval_steps_per_second": 20.311, "step": 1570 }, { "epoch": 0.24192313581381106, "grad_norm": 8.757774353027344, "learning_rate": 4.395192160465473e-06, "loss": 0.629, "step": 1580 }, { "epoch": 0.24192313581381106, "eval_accuracy": 0.5952777963049423, "eval_loss": 0.6752948760986328, "eval_runtime": 279.1759, "eval_samples_per_second": 161.719, "eval_steps_per_second": 20.217, "step": 1580 }, { "epoch": 0.24345429490124024, "grad_norm": 8.354512214660645, "learning_rate": 4.3913642627469e-06, "loss": 0.6632, "step": 1590 }, { "epoch": 0.24345429490124024, "eval_accuracy": 0.5962355663336819, "eval_loss": 0.6745610237121582, "eval_runtime": 279.0496, "eval_samples_per_second": 161.792, "eval_steps_per_second": 20.226, "step": 1590 }, { "epoch": 0.24498545398866942, "grad_norm": 13.983068466186523, "learning_rate": 4.387536365028327e-06, "loss": 0.6018, "step": 1600 }, { "epoch": 0.24498545398866942, "eval_accuracy": 0.5935825309643993, "eval_loss": 0.687160849571228, "eval_runtime": 280.0465, "eval_samples_per_second": 161.216, "eval_steps_per_second": 20.154, "step": 1600 }, { "epoch": 0.2465166130760986, "grad_norm": 8.103803634643555, "learning_rate": 4.383708467309754e-06, "loss": 0.6217, "step": 1610 }, { "epoch": 0.2465166130760986, "eval_accuracy": 0.5935380578595094, "eval_loss": 0.6901026368141174, "eval_runtime": 280.8048, "eval_samples_per_second": 160.781, "eval_steps_per_second": 20.099, "step": 1610 }, { "epoch": 0.2480477721635278, "grad_norm": 9.161907196044922, "learning_rate": 4.379880569591181e-06, "loss": 0.6106, "step": 1620 }, { "epoch": 0.2480477721635278, "eval_accuracy": 0.5978512323160423, "eval_loss": 0.6946441531181335, "eval_runtime": 278.4057, "eval_samples_per_second": 162.166, "eval_steps_per_second": 20.273, "step": 1620 }, { "epoch": 0.24957893125095698, "grad_norm": 7.822539329528809, "learning_rate": 4.3760526718726085e-06, "loss": 0.693, "step": 1630 }, { "epoch": 0.24957893125095698, "eval_accuracy": 0.598020462633452, "eval_loss": 0.6881946921348572, "eval_runtime": 277.0846, "eval_samples_per_second": 162.939, "eval_steps_per_second": 20.369, "step": 1630 }, { "epoch": 0.25111009033838616, "grad_norm": 8.115804672241211, "learning_rate": 4.372224774154035e-06, "loss": 0.6638, "step": 1640 }, { "epoch": 0.25111009033838616, "eval_accuracy": 0.5966775781058632, "eval_loss": 0.6835174560546875, "eval_runtime": 277.1947, "eval_samples_per_second": 162.875, "eval_steps_per_second": 20.361, "step": 1640 }, { "epoch": 0.25264124942581534, "grad_norm": 8.402555465698242, "learning_rate": 4.368396876435462e-06, "loss": 0.5649, "step": 1650 }, { "epoch": 0.25264124942581534, "eval_accuracy": 0.5948628917378918, "eval_loss": 0.6932902336120605, "eval_runtime": 278.2801, "eval_samples_per_second": 162.239, "eval_steps_per_second": 20.282, "step": 1650 }, { "epoch": 0.2541724085132445, "grad_norm": 9.621747970581055, "learning_rate": 4.3645689787168885e-06, "loss": 0.6463, "step": 1660 }, { "epoch": 0.2541724085132445, "eval_accuracy": 0.593183788710789, "eval_loss": 0.6967864036560059, "eval_runtime": 277.2248, "eval_samples_per_second": 162.857, "eval_steps_per_second": 20.359, "step": 1660 }, { "epoch": 0.2557035676006737, "grad_norm": 17.633258819580078, "learning_rate": 4.360741080998316e-06, "loss": 0.5943, "step": 1670 }, { "epoch": 0.2557035676006737, "eval_accuracy": 0.591132348038671, "eval_loss": 0.7154887318611145, "eval_runtime": 277.3036, "eval_samples_per_second": 162.811, "eval_steps_per_second": 20.353, "step": 1670 }, { "epoch": 0.2572347266881029, "grad_norm": 16.508804321289062, "learning_rate": 4.356913183279743e-06, "loss": 0.5856, "step": 1680 }, { "epoch": 0.2572347266881029, "eval_accuracy": 0.5927422936839299, "eval_loss": 0.7325928211212158, "eval_runtime": 279.5671, "eval_samples_per_second": 161.493, "eval_steps_per_second": 20.188, "step": 1680 }, { "epoch": 0.25876588577553206, "grad_norm": 25.668621063232422, "learning_rate": 4.35308528556117e-06, "loss": 0.6454, "step": 1690 }, { "epoch": 0.25876588577553206, "eval_accuracy": 0.5932041424063291, "eval_loss": 0.7432768940925598, "eval_runtime": 281.3082, "eval_samples_per_second": 160.493, "eval_steps_per_second": 20.063, "step": 1690 }, { "epoch": 0.26029704486296124, "grad_norm": 16.12009620666504, "learning_rate": 4.349257387842597e-06, "loss": 0.597, "step": 1700 }, { "epoch": 0.26029704486296124, "eval_accuracy": 0.5940829190340909, "eval_loss": 0.7180017232894897, "eval_runtime": 281.0285, "eval_samples_per_second": 160.653, "eval_steps_per_second": 20.083, "step": 1700 }, { "epoch": 0.26182820395039047, "grad_norm": 17.72113609313965, "learning_rate": 4.345429490124024e-06, "loss": 0.624, "step": 1710 }, { "epoch": 0.26182820395039047, "eval_accuracy": 0.5941739381424987, "eval_loss": 0.7116958498954773, "eval_runtime": 279.9549, "eval_samples_per_second": 161.269, "eval_steps_per_second": 20.16, "step": 1710 }, { "epoch": 0.26335936303781965, "grad_norm": 14.417743682861328, "learning_rate": 4.341601592405451e-06, "loss": 0.5733, "step": 1720 }, { "epoch": 0.26335936303781965, "eval_accuracy": 0.5937305745493295, "eval_loss": 0.7162705063819885, "eval_runtime": 278.2449, "eval_samples_per_second": 162.26, "eval_steps_per_second": 20.284, "step": 1720 }, { "epoch": 0.26489052212524883, "grad_norm": 18.933935165405273, "learning_rate": 4.337773694686878e-06, "loss": 0.5191, "step": 1730 }, { "epoch": 0.26489052212524883, "eval_accuracy": 0.5937839937839938, "eval_loss": 0.7459293603897095, "eval_runtime": 280.6883, "eval_samples_per_second": 160.847, "eval_steps_per_second": 20.108, "step": 1730 }, { "epoch": 0.266421681212678, "grad_norm": 21.37299346923828, "learning_rate": 4.333945796968305e-06, "loss": 0.6065, "step": 1740 }, { "epoch": 0.266421681212678, "eval_accuracy": 0.5947049555047602, "eval_loss": 0.7559405565261841, "eval_runtime": 279.6788, "eval_samples_per_second": 161.428, "eval_steps_per_second": 20.18, "step": 1740 }, { "epoch": 0.2679528403001072, "grad_norm": 17.455568313598633, "learning_rate": 4.3301178992497325e-06, "loss": 0.641, "step": 1750 }, { "epoch": 0.2679528403001072, "eval_accuracy": 0.5933846529272134, "eval_loss": 0.7480175495147705, "eval_runtime": 277.648, "eval_samples_per_second": 162.609, "eval_steps_per_second": 20.328, "step": 1750 }, { "epoch": 0.26948399938753637, "grad_norm": 14.3558349609375, "learning_rate": 4.326290001531159e-06, "loss": 0.6186, "step": 1760 }, { "epoch": 0.26948399938753637, "eval_accuracy": 0.5932022659113628, "eval_loss": 0.7287299633026123, "eval_runtime": 281.2354, "eval_samples_per_second": 160.535, "eval_steps_per_second": 20.069, "step": 1760 }, { "epoch": 0.27101515847496555, "grad_norm": 10.249687194824219, "learning_rate": 4.322462103812587e-06, "loss": 0.6375, "step": 1770 }, { "epoch": 0.27101515847496555, "eval_accuracy": 0.5906849680170576, "eval_loss": 0.7209318280220032, "eval_runtime": 279.922, "eval_samples_per_second": 161.288, "eval_steps_per_second": 20.163, "step": 1770 }, { "epoch": 0.2725463175623947, "grad_norm": 13.502520561218262, "learning_rate": 4.318634206094013e-06, "loss": 0.6078, "step": 1780 }, { "epoch": 0.2725463175623947, "eval_accuracy": 0.590238919975131, "eval_loss": 0.713438868522644, "eval_runtime": 281.1596, "eval_samples_per_second": 160.578, "eval_steps_per_second": 20.074, "step": 1780 }, { "epoch": 0.2740774766498239, "grad_norm": 8.710155487060547, "learning_rate": 4.314806308375441e-06, "loss": 0.6112, "step": 1790 }, { "epoch": 0.2740774766498239, "eval_accuracy": 0.5918866080156403, "eval_loss": 0.7061217427253723, "eval_runtime": 278.6715, "eval_samples_per_second": 162.012, "eval_steps_per_second": 20.253, "step": 1790 }, { "epoch": 0.2756086357372531, "grad_norm": 12.963603973388672, "learning_rate": 4.310978410656867e-06, "loss": 0.6836, "step": 1800 }, { "epoch": 0.2756086357372531, "eval_accuracy": 0.589742449179307, "eval_loss": 0.7048377394676208, "eval_runtime": 280.3976, "eval_samples_per_second": 161.014, "eval_steps_per_second": 20.129, "step": 1800 }, { "epoch": 0.27713979482468226, "grad_norm": 18.37137794494629, "learning_rate": 4.307150512938295e-06, "loss": 0.5662, "step": 1810 }, { "epoch": 0.27713979482468226, "eval_accuracy": 0.5890812901504879, "eval_loss": 0.7051539421081543, "eval_runtime": 280.7367, "eval_samples_per_second": 160.82, "eval_steps_per_second": 20.104, "step": 1810 }, { "epoch": 0.27867095391211144, "grad_norm": 8.255058288574219, "learning_rate": 4.3033226152197215e-06, "loss": 0.6022, "step": 1820 }, { "epoch": 0.27867095391211144, "eval_accuracy": 0.5886953430501244, "eval_loss": 0.7059171199798584, "eval_runtime": 278.1826, "eval_samples_per_second": 162.296, "eval_steps_per_second": 20.289, "step": 1820 }, { "epoch": 0.2802021129995407, "grad_norm": 12.834601402282715, "learning_rate": 4.299494717501149e-06, "loss": 0.5255, "step": 1830 }, { "epoch": 0.2802021129995407, "eval_accuracy": 0.5897788828700826, "eval_loss": 0.724184513092041, "eval_runtime": 281.9681, "eval_samples_per_second": 160.117, "eval_steps_per_second": 20.016, "step": 1830 }, { "epoch": 0.28173327208696985, "grad_norm": 13.296520233154297, "learning_rate": 4.295666819782576e-06, "loss": 0.5974, "step": 1840 }, { "epoch": 0.28173327208696985, "eval_accuracy": 0.5901501208506109, "eval_loss": 0.7438974380493164, "eval_runtime": 279.7112, "eval_samples_per_second": 161.409, "eval_steps_per_second": 20.178, "step": 1840 }, { "epoch": 0.28326443117439903, "grad_norm": 14.873211860656738, "learning_rate": 4.291838922064003e-06, "loss": 0.6871, "step": 1850 }, { "epoch": 0.28326443117439903, "eval_accuracy": 0.5945105702611476, "eval_loss": 0.7173364162445068, "eval_runtime": 278.1451, "eval_samples_per_second": 162.318, "eval_steps_per_second": 20.292, "step": 1850 }, { "epoch": 0.2847955902618282, "grad_norm": 11.980530738830566, "learning_rate": 4.28801102434543e-06, "loss": 0.5518, "step": 1860 }, { "epoch": 0.2847955902618282, "eval_accuracy": 0.5945075210522808, "eval_loss": 0.7088351845741272, "eval_runtime": 281.4397, "eval_samples_per_second": 160.418, "eval_steps_per_second": 20.054, "step": 1860 }, { "epoch": 0.2863267493492574, "grad_norm": 14.939533233642578, "learning_rate": 4.2841831266268565e-06, "loss": 0.5496, "step": 1870 }, { "epoch": 0.2863267493492574, "eval_accuracy": 0.5940158599702348, "eval_loss": 0.7212331295013428, "eval_runtime": 279.5218, "eval_samples_per_second": 161.519, "eval_steps_per_second": 20.192, "step": 1870 }, { "epoch": 0.28785790843668657, "grad_norm": 15.159697532653809, "learning_rate": 4.280355228908284e-06, "loss": 0.5738, "step": 1880 }, { "epoch": 0.28785790843668657, "eval_accuracy": 0.5914114513981358, "eval_loss": 0.7385027408599854, "eval_runtime": 279.6403, "eval_samples_per_second": 161.45, "eval_steps_per_second": 20.183, "step": 1880 }, { "epoch": 0.28938906752411575, "grad_norm": 10.097131729125977, "learning_rate": 4.276527331189711e-06, "loss": 0.5, "step": 1890 }, { "epoch": 0.28938906752411575, "eval_accuracy": 0.5934275634055961, "eval_loss": 0.7404712438583374, "eval_runtime": 278.9074, "eval_samples_per_second": 161.875, "eval_steps_per_second": 20.236, "step": 1890 }, { "epoch": 0.29092022661154493, "grad_norm": 17.089492797851562, "learning_rate": 4.272699433471138e-06, "loss": 0.6033, "step": 1900 }, { "epoch": 0.29092022661154493, "eval_accuracy": 0.5967577397321032, "eval_loss": 0.7266111373901367, "eval_runtime": 280.8228, "eval_samples_per_second": 160.77, "eval_steps_per_second": 20.098, "step": 1900 }, { "epoch": 0.2924513856989741, "grad_norm": 14.520054817199707, "learning_rate": 4.268871535752565e-06, "loss": 0.5852, "step": 1910 }, { "epoch": 0.2924513856989741, "eval_accuracy": 0.5944566495794776, "eval_loss": 0.7083961367607117, "eval_runtime": 278.472, "eval_samples_per_second": 162.128, "eval_steps_per_second": 20.268, "step": 1910 }, { "epoch": 0.2939825447864033, "grad_norm": 16.736730575561523, "learning_rate": 4.265043638033992e-06, "loss": 0.6374, "step": 1920 }, { "epoch": 0.2939825447864033, "eval_accuracy": 0.5979578246392897, "eval_loss": 0.6861377358436584, "eval_runtime": 279.2609, "eval_samples_per_second": 161.67, "eval_steps_per_second": 20.21, "step": 1920 }, { "epoch": 0.29551370387383247, "grad_norm": 9.897313117980957, "learning_rate": 4.261215740315419e-06, "loss": 0.5925, "step": 1930 }, { "epoch": 0.29551370387383247, "eval_accuracy": 0.5983366600133068, "eval_loss": 0.6827172636985779, "eval_runtime": 278.8073, "eval_samples_per_second": 161.933, "eval_steps_per_second": 20.243, "step": 1930 }, { "epoch": 0.2970448629612617, "grad_norm": 7.9534478187561035, "learning_rate": 4.257387842596846e-06, "loss": 0.5634, "step": 1940 }, { "epoch": 0.2970448629612617, "eval_accuracy": 0.5988369512140986, "eval_loss": 0.684248685836792, "eval_runtime": 277.3928, "eval_samples_per_second": 162.758, "eval_steps_per_second": 20.347, "step": 1940 }, { "epoch": 0.2985760220486909, "grad_norm": 13.70839786529541, "learning_rate": 4.253559944878273e-06, "loss": 0.5783, "step": 1950 }, { "epoch": 0.2985760220486909, "eval_accuracy": 0.597880548042389, "eval_loss": 0.705771267414093, "eval_runtime": 277.0321, "eval_samples_per_second": 162.97, "eval_steps_per_second": 20.373, "step": 1950 }, { "epoch": 0.30010718113612006, "grad_norm": 18.95427703857422, "learning_rate": 4.2497320471597005e-06, "loss": 0.7029, "step": 1960 }, { "epoch": 0.30010718113612006, "eval_accuracy": 0.5943931866572036, "eval_loss": 0.7076370716094971, "eval_runtime": 279.5179, "eval_samples_per_second": 161.521, "eval_steps_per_second": 20.192, "step": 1960 }, { "epoch": 0.30163834022354924, "grad_norm": 12.317983627319336, "learning_rate": 4.245904149441127e-06, "loss": 0.562, "step": 1970 }, { "epoch": 0.30163834022354924, "eval_accuracy": 0.5903159950292917, "eval_loss": 0.6966370344161987, "eval_runtime": 278.3564, "eval_samples_per_second": 162.195, "eval_steps_per_second": 20.276, "step": 1970 }, { "epoch": 0.3031694993109784, "grad_norm": 18.507949829101562, "learning_rate": 4.242076251722555e-06, "loss": 0.6133, "step": 1980 }, { "epoch": 0.3031694993109784, "eval_accuracy": 0.5898846495119787, "eval_loss": 0.697861909866333, "eval_runtime": 276.7732, "eval_samples_per_second": 163.123, "eval_steps_per_second": 20.392, "step": 1980 }, { "epoch": 0.3047006583984076, "grad_norm": 10.3158597946167, "learning_rate": 4.238248354003981e-06, "loss": 0.5549, "step": 1990 }, { "epoch": 0.3047006583984076, "eval_accuracy": 0.5933229813664597, "eval_loss": 0.6916565299034119, "eval_runtime": 279.0507, "eval_samples_per_second": 161.791, "eval_steps_per_second": 20.226, "step": 1990 }, { "epoch": 0.3062318174858368, "grad_norm": 17.062057495117188, "learning_rate": 4.234420456285409e-06, "loss": 0.6238, "step": 2000 }, { "epoch": 0.3062318174858368, "eval_accuracy": 0.5943655723158828, "eval_loss": 0.7041603326797485, "eval_runtime": 280.3627, "eval_samples_per_second": 161.034, "eval_steps_per_second": 20.131, "step": 2000 }, { "epoch": 0.30776297657326596, "grad_norm": 7.667088985443115, "learning_rate": 4.230592558566835e-06, "loss": 0.6945, "step": 2010 }, { "epoch": 0.30776297657326596, "eval_accuracy": 0.5923155464796236, "eval_loss": 0.695047914981842, "eval_runtime": 282.0476, "eval_samples_per_second": 160.072, "eval_steps_per_second": 20.011, "step": 2010 }, { "epoch": 0.30929413566069514, "grad_norm": 13.864084243774414, "learning_rate": 4.226764660848263e-06, "loss": 0.6421, "step": 2020 }, { "epoch": 0.30929413566069514, "eval_accuracy": 0.5927388930806444, "eval_loss": 0.6951669454574585, "eval_runtime": 282.3074, "eval_samples_per_second": 159.925, "eval_steps_per_second": 19.992, "step": 2020 }, { "epoch": 0.3108252947481243, "grad_norm": 9.97375202178955, "learning_rate": 4.2229367631296895e-06, "loss": 0.5758, "step": 2030 }, { "epoch": 0.3108252947481243, "eval_accuracy": 0.5917714488825698, "eval_loss": 0.6952547430992126, "eval_runtime": 281.5694, "eval_samples_per_second": 160.344, "eval_steps_per_second": 20.045, "step": 2030 }, { "epoch": 0.3123564538355535, "grad_norm": 7.828521251678467, "learning_rate": 4.219108865411117e-06, "loss": 0.6181, "step": 2040 }, { "epoch": 0.3123564538355535, "eval_accuracy": 0.5886520097712636, "eval_loss": 0.6984680891036987, "eval_runtime": 278.5618, "eval_samples_per_second": 162.075, "eval_steps_per_second": 20.261, "step": 2040 }, { "epoch": 0.3138876129229827, "grad_norm": 10.627179145812988, "learning_rate": 4.215280967692544e-06, "loss": 0.6605, "step": 2050 }, { "epoch": 0.3138876129229827, "eval_accuracy": 0.5845493371296379, "eval_loss": 0.6960271000862122, "eval_runtime": 278.4801, "eval_samples_per_second": 162.123, "eval_steps_per_second": 20.267, "step": 2050 }, { "epoch": 0.3154187720104119, "grad_norm": 6.945221424102783, "learning_rate": 4.211453069973971e-06, "loss": 0.6138, "step": 2060 }, { "epoch": 0.3154187720104119, "eval_accuracy": 0.5852839088643645, "eval_loss": 0.6904491782188416, "eval_runtime": 276.524, "eval_samples_per_second": 163.27, "eval_steps_per_second": 20.411, "step": 2060 }, { "epoch": 0.3169499310978411, "grad_norm": 13.37806224822998, "learning_rate": 4.207625172255398e-06, "loss": 0.5744, "step": 2070 }, { "epoch": 0.3169499310978411, "eval_accuracy": 0.5887589069679682, "eval_loss": 0.6954379677772522, "eval_runtime": 275.6555, "eval_samples_per_second": 163.784, "eval_steps_per_second": 20.475, "step": 2070 }, { "epoch": 0.31848109018527027, "grad_norm": 11.931571006774902, "learning_rate": 4.203797274536825e-06, "loss": 0.5473, "step": 2080 }, { "epoch": 0.31848109018527027, "eval_accuracy": 0.589274223967694, "eval_loss": 0.7085046172142029, "eval_runtime": 276.9102, "eval_samples_per_second": 163.042, "eval_steps_per_second": 20.382, "step": 2080 }, { "epoch": 0.32001224927269944, "grad_norm": 17.946001052856445, "learning_rate": 4.199969376818252e-06, "loss": 0.6201, "step": 2090 }, { "epoch": 0.32001224927269944, "eval_accuracy": 0.5832519747936452, "eval_loss": 0.7224695086479187, "eval_runtime": 278.3015, "eval_samples_per_second": 162.227, "eval_steps_per_second": 20.28, "step": 2090 }, { "epoch": 0.3215434083601286, "grad_norm": 9.482304573059082, "learning_rate": 4.1961414790996794e-06, "loss": 0.5663, "step": 2100 }, { "epoch": 0.3215434083601286, "eval_accuracy": 0.5839268676917615, "eval_loss": 0.7226927876472473, "eval_runtime": 280.269, "eval_samples_per_second": 161.088, "eval_steps_per_second": 20.138, "step": 2100 }, { "epoch": 0.3230745674475578, "grad_norm": 10.172694206237793, "learning_rate": 4.192313581381106e-06, "loss": 0.612, "step": 2110 }, { "epoch": 0.3230745674475578, "eval_accuracy": 0.5900024405937299, "eval_loss": 0.7088232040405273, "eval_runtime": 280.1784, "eval_samples_per_second": 161.14, "eval_steps_per_second": 20.144, "step": 2110 }, { "epoch": 0.324605726534987, "grad_norm": 11.057249069213867, "learning_rate": 4.188485683662533e-06, "loss": 0.5937, "step": 2120 }, { "epoch": 0.324605726534987, "eval_accuracy": 0.5903734771320152, "eval_loss": 0.7097996473312378, "eval_runtime": 281.3342, "eval_samples_per_second": 160.478, "eval_steps_per_second": 20.062, "step": 2120 }, { "epoch": 0.32613688562241616, "grad_norm": 12.521862030029297, "learning_rate": 4.184657785943959e-06, "loss": 0.6988, "step": 2130 }, { "epoch": 0.32613688562241616, "eval_accuracy": 0.5909282466452257, "eval_loss": 0.6956667900085449, "eval_runtime": 280.7428, "eval_samples_per_second": 160.816, "eval_steps_per_second": 20.104, "step": 2130 }, { "epoch": 0.32766804470984534, "grad_norm": 13.895928382873535, "learning_rate": 4.180829888225387e-06, "loss": 0.4822, "step": 2140 }, { "epoch": 0.32766804470984534, "eval_accuracy": 0.5896343627973021, "eval_loss": 0.7213166356086731, "eval_runtime": 281.166, "eval_samples_per_second": 160.574, "eval_steps_per_second": 20.074, "step": 2140 }, { "epoch": 0.3291992037972745, "grad_norm": 11.10944938659668, "learning_rate": 4.1770019905068135e-06, "loss": 0.5878, "step": 2150 }, { "epoch": 0.3291992037972745, "eval_accuracy": 0.5907275953859805, "eval_loss": 0.742756724357605, "eval_runtime": 281.9419, "eval_samples_per_second": 160.132, "eval_steps_per_second": 20.018, "step": 2150 }, { "epoch": 0.3307303628847037, "grad_norm": 12.602340698242188, "learning_rate": 4.173174092788241e-06, "loss": 0.5722, "step": 2160 }, { "epoch": 0.3307303628847037, "eval_accuracy": 0.590145030380982, "eval_loss": 0.7571865320205688, "eval_runtime": 279.9038, "eval_samples_per_second": 161.298, "eval_steps_per_second": 20.164, "step": 2160 }, { "epoch": 0.3322615219721329, "grad_norm": 18.790254592895508, "learning_rate": 4.169346195069668e-06, "loss": 0.6094, "step": 2170 }, { "epoch": 0.3322615219721329, "eval_accuracy": 0.5902217294900222, "eval_loss": 0.7526936531066895, "eval_runtime": 280.4762, "eval_samples_per_second": 160.969, "eval_steps_per_second": 20.123, "step": 2170 }, { "epoch": 0.3337926810595621, "grad_norm": 13.405548095703125, "learning_rate": 4.165518297351095e-06, "loss": 0.693, "step": 2180 }, { "epoch": 0.3337926810595621, "eval_accuracy": 0.5901581176679307, "eval_loss": 0.7200701832771301, "eval_runtime": 281.5673, "eval_samples_per_second": 160.345, "eval_steps_per_second": 20.045, "step": 2180 }, { "epoch": 0.3353238401469913, "grad_norm": 10.354043006896973, "learning_rate": 4.161690399632522e-06, "loss": 0.499, "step": 2190 }, { "epoch": 0.3353238401469913, "eval_accuracy": 0.5892896756732774, "eval_loss": 0.721836507320404, "eval_runtime": 279.1391, "eval_samples_per_second": 161.74, "eval_steps_per_second": 20.219, "step": 2190 }, { "epoch": 0.33685499923442047, "grad_norm": 8.689166069030762, "learning_rate": 4.157862501913949e-06, "loss": 0.594, "step": 2200 }, { "epoch": 0.33685499923442047, "eval_accuracy": 0.5895724296992815, "eval_loss": 0.7207421064376831, "eval_runtime": 279.1316, "eval_samples_per_second": 161.744, "eval_steps_per_second": 20.22, "step": 2200 }, { "epoch": 0.33838615832184965, "grad_norm": 12.664347648620605, "learning_rate": 4.154034604195376e-06, "loss": 0.5292, "step": 2210 }, { "epoch": 0.33838615832184965, "eval_accuracy": 0.5918439794990127, "eval_loss": 0.7299882173538208, "eval_runtime": 281.451, "eval_samples_per_second": 160.412, "eval_steps_per_second": 20.053, "step": 2210 }, { "epoch": 0.33991731740927883, "grad_norm": 14.595951080322266, "learning_rate": 4.150206706476803e-06, "loss": 0.5728, "step": 2220 }, { "epoch": 0.33991731740927883, "eval_accuracy": 0.5933771015392805, "eval_loss": 0.7359711527824402, "eval_runtime": 281.6141, "eval_samples_per_second": 160.319, "eval_steps_per_second": 20.042, "step": 2220 }, { "epoch": 0.341448476496708, "grad_norm": 16.81365203857422, "learning_rate": 4.14637880875823e-06, "loss": 0.6216, "step": 2230 }, { "epoch": 0.341448476496708, "eval_accuracy": 0.5928677563150074, "eval_loss": 0.7266600728034973, "eval_runtime": 281.4751, "eval_samples_per_second": 160.398, "eval_steps_per_second": 20.052, "step": 2230 }, { "epoch": 0.3429796355841372, "grad_norm": 9.753067016601562, "learning_rate": 4.1425509110396575e-06, "loss": 0.5759, "step": 2240 }, { "epoch": 0.3429796355841372, "eval_accuracy": 0.5927989522519923, "eval_loss": 0.7114787697792053, "eval_runtime": 281.2888, "eval_samples_per_second": 160.504, "eval_steps_per_second": 20.065, "step": 2240 }, { "epoch": 0.34451079467156637, "grad_norm": 10.276047706604004, "learning_rate": 4.138723013321084e-06, "loss": 0.621, "step": 2250 }, { "epoch": 0.34451079467156637, "eval_accuracy": 0.5948861366360367, "eval_loss": 0.7070339918136597, "eval_runtime": 280.8031, "eval_samples_per_second": 160.782, "eval_steps_per_second": 20.099, "step": 2250 }, { "epoch": 0.34604195375899555, "grad_norm": 11.647406578063965, "learning_rate": 4.134895115602512e-06, "loss": 0.6023, "step": 2260 }, { "epoch": 0.34604195375899555, "eval_accuracy": 0.5949940087871123, "eval_loss": 0.7148999571800232, "eval_runtime": 280.6819, "eval_samples_per_second": 160.851, "eval_steps_per_second": 20.108, "step": 2260 }, { "epoch": 0.3475731128464247, "grad_norm": 9.785872459411621, "learning_rate": 4.131067217883938e-06, "loss": 0.578, "step": 2270 }, { "epoch": 0.3475731128464247, "eval_accuracy": 0.59318833174113, "eval_loss": 0.7126178741455078, "eval_runtime": 281.5251, "eval_samples_per_second": 160.369, "eval_steps_per_second": 20.048, "step": 2270 }, { "epoch": 0.3491042719338539, "grad_norm": 11.013738632202148, "learning_rate": 4.127239320165366e-06, "loss": 0.5701, "step": 2280 }, { "epoch": 0.3491042719338539, "eval_accuracy": 0.5925876549793361, "eval_loss": 0.7025783061981201, "eval_runtime": 278.114, "eval_samples_per_second": 162.336, "eval_steps_per_second": 20.294, "step": 2280 }, { "epoch": 0.3506354310212831, "grad_norm": 9.779340744018555, "learning_rate": 4.1234114224467924e-06, "loss": 0.6761, "step": 2290 }, { "epoch": 0.3506354310212831, "eval_accuracy": 0.5935654336338203, "eval_loss": 0.6881637573242188, "eval_runtime": 281.0803, "eval_samples_per_second": 160.623, "eval_steps_per_second": 20.08, "step": 2290 }, { "epoch": 0.3521665901087123, "grad_norm": 13.62732219696045, "learning_rate": 4.11958352472822e-06, "loss": 0.5771, "step": 2300 }, { "epoch": 0.3521665901087123, "eval_accuracy": 0.5967165834719911, "eval_loss": 0.6921752691268921, "eval_runtime": 278.472, "eval_samples_per_second": 162.128, "eval_steps_per_second": 20.268, "step": 2300 }, { "epoch": 0.3536977491961415, "grad_norm": 13.277196884155273, "learning_rate": 4.1157556270096466e-06, "loss": 0.6241, "step": 2310 }, { "epoch": 0.3536977491961415, "eval_accuracy": 0.5976616231086658, "eval_loss": 0.6972672939300537, "eval_runtime": 279.3077, "eval_samples_per_second": 161.643, "eval_steps_per_second": 20.207, "step": 2310 }, { "epoch": 0.3552289082835707, "grad_norm": 11.036153793334961, "learning_rate": 4.111927729291074e-06, "loss": 0.6102, "step": 2320 }, { "epoch": 0.3552289082835707, "eval_accuracy": 0.5959098571555319, "eval_loss": 0.6897289752960205, "eval_runtime": 280.1189, "eval_samples_per_second": 161.174, "eval_steps_per_second": 20.149, "step": 2320 }, { "epoch": 0.35676006737099986, "grad_norm": 16.50404167175293, "learning_rate": 4.108099831572501e-06, "loss": 0.5876, "step": 2330 }, { "epoch": 0.35676006737099986, "eval_accuracy": 0.595568665720369, "eval_loss": 0.6913372874259949, "eval_runtime": 279.8966, "eval_samples_per_second": 161.302, "eval_steps_per_second": 20.165, "step": 2330 }, { "epoch": 0.35829122645842904, "grad_norm": 10.642626762390137, "learning_rate": 4.104271933853927e-06, "loss": 0.651, "step": 2340 }, { "epoch": 0.35829122645842904, "eval_accuracy": 0.5946874792133212, "eval_loss": 0.6878921389579773, "eval_runtime": 280.9394, "eval_samples_per_second": 160.704, "eval_steps_per_second": 20.09, "step": 2340 }, { "epoch": 0.3598223855458582, "grad_norm": 13.040077209472656, "learning_rate": 4.100444036135355e-06, "loss": 0.5587, "step": 2350 }, { "epoch": 0.3598223855458582, "eval_accuracy": 0.5935343584281879, "eval_loss": 0.6936639547348022, "eval_runtime": 282.096, "eval_samples_per_second": 160.045, "eval_steps_per_second": 20.007, "step": 2350 }, { "epoch": 0.3613535446332874, "grad_norm": 10.807535171508789, "learning_rate": 4.0966161384167815e-06, "loss": 0.6514, "step": 2360 }, { "epoch": 0.3613535446332874, "eval_accuracy": 0.5977954711792233, "eval_loss": 0.6872532963752747, "eval_runtime": 279.4925, "eval_samples_per_second": 161.536, "eval_steps_per_second": 20.194, "step": 2360 }, { "epoch": 0.3628847037207166, "grad_norm": 10.98725700378418, "learning_rate": 4.092788240698209e-06, "loss": 0.6015, "step": 2370 }, { "epoch": 0.3628847037207166, "eval_accuracy": 0.5974941789555384, "eval_loss": 0.6847018003463745, "eval_runtime": 281.2012, "eval_samples_per_second": 160.554, "eval_steps_per_second": 20.071, "step": 2370 }, { "epoch": 0.36441586280814575, "grad_norm": 12.160524368286133, "learning_rate": 4.088960342979636e-06, "loss": 0.5671, "step": 2380 }, { "epoch": 0.36441586280814575, "eval_accuracy": 0.598935344349562, "eval_loss": 0.6907532811164856, "eval_runtime": 281.9981, "eval_samples_per_second": 160.1, "eval_steps_per_second": 20.014, "step": 2380 }, { "epoch": 0.36594702189557493, "grad_norm": 12.533185005187988, "learning_rate": 4.085132445261063e-06, "loss": 0.6757, "step": 2390 }, { "epoch": 0.36594702189557493, "eval_accuracy": 0.5970619563287769, "eval_loss": 0.6963858008384705, "eval_runtime": 281.702, "eval_samples_per_second": 160.269, "eval_steps_per_second": 20.035, "step": 2390 }, { "epoch": 0.3674781809830041, "grad_norm": 11.481986045837402, "learning_rate": 4.08130454754249e-06, "loss": 0.6244, "step": 2400 }, { "epoch": 0.3674781809830041, "eval_accuracy": 0.5956671480946562, "eval_loss": 0.6951790452003479, "eval_runtime": 281.7788, "eval_samples_per_second": 160.225, "eval_steps_per_second": 20.03, "step": 2400 }, { "epoch": 0.36900934007043334, "grad_norm": 15.283388137817383, "learning_rate": 4.077476649823917e-06, "loss": 0.5761, "step": 2410 }, { "epoch": 0.36900934007043334, "eval_accuracy": 0.5921616520484068, "eval_loss": 0.7129482626914978, "eval_runtime": 280.9999, "eval_samples_per_second": 160.669, "eval_steps_per_second": 20.085, "step": 2410 }, { "epoch": 0.3705404991578625, "grad_norm": 14.590538024902344, "learning_rate": 4.073648752105344e-06, "loss": 0.5847, "step": 2420 }, { "epoch": 0.3705404991578625, "eval_accuracy": 0.5910292582142936, "eval_loss": 0.7289432287216187, "eval_runtime": 281.5113, "eval_samples_per_second": 160.377, "eval_steps_per_second": 20.049, "step": 2420 }, { "epoch": 0.3720716582452917, "grad_norm": 14.669201850891113, "learning_rate": 4.069820854386771e-06, "loss": 0.5957, "step": 2430 }, { "epoch": 0.3720716582452917, "eval_accuracy": 0.5891025356365736, "eval_loss": 0.7361324429512024, "eval_runtime": 278.169, "eval_samples_per_second": 162.304, "eval_steps_per_second": 20.29, "step": 2430 }, { "epoch": 0.3736028173327209, "grad_norm": 9.489580154418945, "learning_rate": 4.065992956668198e-06, "loss": 0.5718, "step": 2440 }, { "epoch": 0.3736028173327209, "eval_accuracy": 0.5889370209930024, "eval_loss": 0.7279490828514099, "eval_runtime": 277.7775, "eval_samples_per_second": 162.533, "eval_steps_per_second": 20.318, "step": 2440 }, { "epoch": 0.37513397642015006, "grad_norm": 15.029380798339844, "learning_rate": 4.0621650589496255e-06, "loss": 0.6081, "step": 2450 }, { "epoch": 0.37513397642015006, "eval_accuracy": 0.5909000155289837, "eval_loss": 0.72515469789505, "eval_runtime": 280.1896, "eval_samples_per_second": 161.134, "eval_steps_per_second": 20.143, "step": 2450 }, { "epoch": 0.37666513550757924, "grad_norm": 12.974061965942383, "learning_rate": 4.058337161231052e-06, "loss": 0.5805, "step": 2460 }, { "epoch": 0.37666513550757924, "eval_accuracy": 0.5923000110950849, "eval_loss": 0.7263885736465454, "eval_runtime": 277.833, "eval_samples_per_second": 162.501, "eval_steps_per_second": 20.314, "step": 2460 }, { "epoch": 0.3781962945950084, "grad_norm": 17.26422119140625, "learning_rate": 4.05450926351248e-06, "loss": 0.6574, "step": 2470 }, { "epoch": 0.3781962945950084, "eval_accuracy": 0.5921569497769591, "eval_loss": 0.7078375816345215, "eval_runtime": 278.2251, "eval_samples_per_second": 162.271, "eval_steps_per_second": 20.286, "step": 2470 }, { "epoch": 0.3797274536824376, "grad_norm": 13.827315330505371, "learning_rate": 4.050681365793906e-06, "loss": 0.6347, "step": 2480 }, { "epoch": 0.3797274536824376, "eval_accuracy": 0.5945813901843215, "eval_loss": 0.700303316116333, "eval_runtime": 280.284, "eval_samples_per_second": 161.079, "eval_steps_per_second": 20.137, "step": 2480 }, { "epoch": 0.3812586127698668, "grad_norm": 12.102642059326172, "learning_rate": 4.046853468075334e-06, "loss": 0.6385, "step": 2490 }, { "epoch": 0.3812586127698668, "eval_accuracy": 0.5984858576439768, "eval_loss": 0.6862630844116211, "eval_runtime": 277.6537, "eval_samples_per_second": 162.605, "eval_steps_per_second": 20.327, "step": 2490 }, { "epoch": 0.38278977185729596, "grad_norm": 8.007050514221191, "learning_rate": 4.04302557035676e-06, "loss": 0.5878, "step": 2500 }, { "epoch": 0.38278977185729596, "eval_accuracy": 0.6000088768558177, "eval_loss": 0.6816014647483826, "eval_runtime": 278.6731, "eval_samples_per_second": 162.011, "eval_steps_per_second": 20.253, "step": 2500 } ], "logging_steps": 10, "max_steps": 13062, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }