{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 15000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002, "grad_norm": 14.211126327514648, "learning_rate": 6.000000000000001e-08, "loss": 0.9806, "step": 10 }, { "epoch": 0.004, "grad_norm": 8.715738296508789, "learning_rate": 1.2666666666666666e-07, "loss": 0.9899, "step": 20 }, { "epoch": 0.006, "grad_norm": 10.858619689941406, "learning_rate": 1.9333333333333337e-07, "loss": 1.0494, "step": 30 }, { "epoch": 0.008, "grad_norm": 11.522907257080078, "learning_rate": 2.6e-07, "loss": 0.9985, "step": 40 }, { "epoch": 0.01, "grad_norm": 11.823868751525879, "learning_rate": 3.266666666666667e-07, "loss": 0.9687, "step": 50 }, { "epoch": 0.012, "grad_norm": 10.01672077178955, "learning_rate": 3.9333333333333336e-07, "loss": 0.7783, "step": 60 }, { "epoch": 0.014, "grad_norm": 8.285725593566895, "learning_rate": 4.6000000000000004e-07, "loss": 0.8029, "step": 70 }, { "epoch": 0.016, "grad_norm": 3.7833917140960693, "learning_rate": 5.266666666666667e-07, "loss": 0.6355, "step": 80 }, { "epoch": 0.018, "grad_norm": 3.166145086288452, "learning_rate": 5.933333333333334e-07, "loss": 0.5781, "step": 90 }, { "epoch": 0.02, "grad_norm": 2.388826847076416, "learning_rate": 6.6e-07, "loss": 0.5602, "step": 100 }, { "epoch": 0.022, "grad_norm": 2.683410167694092, "learning_rate": 7.266666666666668e-07, "loss": 0.5822, "step": 110 }, { "epoch": 0.024, "grad_norm": 2.4587814807891846, "learning_rate": 7.933333333333335e-07, "loss": 0.4702, "step": 120 }, { "epoch": 0.026, "grad_norm": 2.512497663497925, "learning_rate": 8.6e-07, "loss": 0.5497, "step": 130 }, { "epoch": 0.028, "grad_norm": 1.4995685815811157, "learning_rate": 9.266666666666667e-07, "loss": 0.5195, "step": 140 }, { "epoch": 0.03, "grad_norm": 2.7800076007843018, "learning_rate": 9.933333333333333e-07, "loss": 0.4665, "step": 150 }, { "epoch": 0.032, "grad_norm": 1.491605281829834, "learning_rate": 1.06e-06, "loss": 0.5083, "step": 160 }, { "epoch": 0.034, "grad_norm": 1.643656611442566, "learning_rate": 1.1266666666666667e-06, "loss": 0.4549, "step": 170 }, { "epoch": 0.036, "grad_norm": 2.509424924850464, "learning_rate": 1.1933333333333335e-06, "loss": 0.4816, "step": 180 }, { "epoch": 0.038, "grad_norm": 1.9511481523513794, "learning_rate": 1.26e-06, "loss": 0.5072, "step": 190 }, { "epoch": 0.04, "grad_norm": 2.516787052154541, "learning_rate": 1.3266666666666667e-06, "loss": 0.497, "step": 200 }, { "epoch": 0.042, "grad_norm": 2.568359851837158, "learning_rate": 1.3933333333333335e-06, "loss": 0.485, "step": 210 }, { "epoch": 0.044, "grad_norm": 1.8086744546890259, "learning_rate": 1.46e-06, "loss": 0.5004, "step": 220 }, { "epoch": 0.046, "grad_norm": 1.9711308479309082, "learning_rate": 1.526666666666667e-06, "loss": 0.502, "step": 230 }, { "epoch": 0.048, "grad_norm": 1.787848949432373, "learning_rate": 1.5933333333333335e-06, "loss": 0.5269, "step": 240 }, { "epoch": 0.05, "grad_norm": 1.7853062152862549, "learning_rate": 1.6600000000000002e-06, "loss": 0.4902, "step": 250 }, { "epoch": 0.052, "grad_norm": 1.663499355316162, "learning_rate": 1.7266666666666667e-06, "loss": 0.5142, "step": 260 }, { "epoch": 0.054, "grad_norm": 1.4939693212509155, "learning_rate": 1.7933333333333337e-06, "loss": 0.4769, "step": 270 }, { "epoch": 0.056, "grad_norm": 1.9765783548355103, "learning_rate": 1.8600000000000002e-06, "loss": 0.4955, "step": 280 }, { "epoch": 0.058, "grad_norm": 1.5771379470825195, "learning_rate": 1.926666666666667e-06, "loss": 0.4419, "step": 290 }, { "epoch": 0.06, "grad_norm": 1.0956958532333374, "learning_rate": 1.9933333333333334e-06, "loss": 0.4221, "step": 300 }, { "epoch": 0.062, "grad_norm": 2.1830596923828125, "learning_rate": 2.06e-06, "loss": 0.5702, "step": 310 }, { "epoch": 0.064, "grad_norm": 2.9908502101898193, "learning_rate": 2.126666666666667e-06, "loss": 0.5233, "step": 320 }, { "epoch": 0.066, "grad_norm": 1.752899169921875, "learning_rate": 2.1933333333333332e-06, "loss": 0.4822, "step": 330 }, { "epoch": 0.068, "grad_norm": 1.7774080038070679, "learning_rate": 2.2600000000000004e-06, "loss": 0.4571, "step": 340 }, { "epoch": 0.07, "grad_norm": 2.2225241661071777, "learning_rate": 2.3266666666666667e-06, "loss": 0.3769, "step": 350 }, { "epoch": 0.072, "grad_norm": 1.7270737886428833, "learning_rate": 2.3933333333333334e-06, "loss": 0.4333, "step": 360 }, { "epoch": 0.074, "grad_norm": 1.9639253616333008, "learning_rate": 2.46e-06, "loss": 0.494, "step": 370 }, { "epoch": 0.076, "grad_norm": 1.4829277992248535, "learning_rate": 2.526666666666667e-06, "loss": 0.455, "step": 380 }, { "epoch": 0.078, "grad_norm": 2.67972469329834, "learning_rate": 2.5933333333333336e-06, "loss": 0.4865, "step": 390 }, { "epoch": 0.08, "grad_norm": 2.0642127990722656, "learning_rate": 2.6600000000000004e-06, "loss": 0.4665, "step": 400 }, { "epoch": 0.082, "grad_norm": 1.5665631294250488, "learning_rate": 2.726666666666667e-06, "loss": 0.4586, "step": 410 }, { "epoch": 0.084, "grad_norm": 1.1839872598648071, "learning_rate": 2.7933333333333334e-06, "loss": 0.4541, "step": 420 }, { "epoch": 0.086, "grad_norm": 1.2557004690170288, "learning_rate": 2.86e-06, "loss": 0.4357, "step": 430 }, { "epoch": 0.088, "grad_norm": 1.4764341115951538, "learning_rate": 2.9266666666666673e-06, "loss": 0.5011, "step": 440 }, { "epoch": 0.09, "grad_norm": 1.9956254959106445, "learning_rate": 2.9933333333333336e-06, "loss": 0.4933, "step": 450 }, { "epoch": 0.092, "grad_norm": 1.6207813024520874, "learning_rate": 3.0600000000000003e-06, "loss": 0.4402, "step": 460 }, { "epoch": 0.094, "grad_norm": 1.7665313482284546, "learning_rate": 3.1266666666666667e-06, "loss": 0.4392, "step": 470 }, { "epoch": 0.096, "grad_norm": 1.76856529712677, "learning_rate": 3.193333333333334e-06, "loss": 0.4521, "step": 480 }, { "epoch": 0.098, "grad_norm": 1.7528122663497925, "learning_rate": 3.2600000000000006e-06, "loss": 0.4168, "step": 490 }, { "epoch": 0.1, "grad_norm": 1.3086860179901123, "learning_rate": 3.326666666666667e-06, "loss": 0.4228, "step": 500 }, { "epoch": 0.102, "grad_norm": 2.676274061203003, "learning_rate": 3.3933333333333336e-06, "loss": 0.4733, "step": 510 }, { "epoch": 0.104, "grad_norm": 1.9702718257904053, "learning_rate": 3.46e-06, "loss": 0.4273, "step": 520 }, { "epoch": 0.106, "grad_norm": 1.8276294469833374, "learning_rate": 3.526666666666667e-06, "loss": 0.491, "step": 530 }, { "epoch": 0.108, "grad_norm": 3.189826726913452, "learning_rate": 3.593333333333334e-06, "loss": 0.5354, "step": 540 }, { "epoch": 0.11, "grad_norm": 1.3039813041687012, "learning_rate": 3.66e-06, "loss": 0.4561, "step": 550 }, { "epoch": 0.112, "grad_norm": 1.4156097173690796, "learning_rate": 3.726666666666667e-06, "loss": 0.3896, "step": 560 }, { "epoch": 0.114, "grad_norm": 1.6464276313781738, "learning_rate": 3.793333333333334e-06, "loss": 0.4438, "step": 570 }, { "epoch": 0.116, "grad_norm": 1.4432697296142578, "learning_rate": 3.86e-06, "loss": 0.5175, "step": 580 }, { "epoch": 0.118, "grad_norm": 1.6621713638305664, "learning_rate": 3.926666666666667e-06, "loss": 0.4429, "step": 590 }, { "epoch": 0.12, "grad_norm": 1.6414673328399658, "learning_rate": 3.993333333333334e-06, "loss": 0.4958, "step": 600 }, { "epoch": 0.122, "grad_norm": 1.51468026638031, "learning_rate": 4.060000000000001e-06, "loss": 0.4636, "step": 610 }, { "epoch": 0.124, "grad_norm": 1.9095144271850586, "learning_rate": 4.126666666666667e-06, "loss": 0.4545, "step": 620 }, { "epoch": 0.126, "grad_norm": 1.6873503923416138, "learning_rate": 4.1933333333333336e-06, "loss": 0.4203, "step": 630 }, { "epoch": 0.128, "grad_norm": 1.8741629123687744, "learning_rate": 4.26e-06, "loss": 0.4713, "step": 640 }, { "epoch": 0.13, "grad_norm": 1.5286614894866943, "learning_rate": 4.326666666666667e-06, "loss": 0.3901, "step": 650 }, { "epoch": 0.132, "grad_norm": 1.4989817142486572, "learning_rate": 4.393333333333334e-06, "loss": 0.4117, "step": 660 }, { "epoch": 0.134, "grad_norm": 1.642307996749878, "learning_rate": 4.4600000000000005e-06, "loss": 0.5395, "step": 670 }, { "epoch": 0.136, "grad_norm": 2.079261064529419, "learning_rate": 4.526666666666667e-06, "loss": 0.4314, "step": 680 }, { "epoch": 0.138, "grad_norm": 1.440948486328125, "learning_rate": 4.593333333333333e-06, "loss": 0.451, "step": 690 }, { "epoch": 0.14, "grad_norm": 1.213745355606079, "learning_rate": 4.66e-06, "loss": 0.4941, "step": 700 }, { "epoch": 0.142, "grad_norm": 2.1743085384368896, "learning_rate": 4.7266666666666674e-06, "loss": 0.4749, "step": 710 }, { "epoch": 0.144, "grad_norm": 1.6815190315246582, "learning_rate": 4.793333333333334e-06, "loss": 0.606, "step": 720 }, { "epoch": 0.146, "grad_norm": 1.5993754863739014, "learning_rate": 4.86e-06, "loss": 0.4041, "step": 730 }, { "epoch": 0.148, "grad_norm": 2.1879842281341553, "learning_rate": 4.926666666666667e-06, "loss": 0.482, "step": 740 }, { "epoch": 0.15, "grad_norm": 1.3680864572525024, "learning_rate": 4.9933333333333335e-06, "loss": 0.4598, "step": 750 }, { "epoch": 0.152, "grad_norm": 1.4133329391479492, "learning_rate": 5.060000000000001e-06, "loss": 0.4606, "step": 760 }, { "epoch": 0.154, "grad_norm": 1.6556570529937744, "learning_rate": 5.126666666666668e-06, "loss": 0.3716, "step": 770 }, { "epoch": 0.156, "grad_norm": 1.5880504846572876, "learning_rate": 5.193333333333333e-06, "loss": 0.5006, "step": 780 }, { "epoch": 0.158, "grad_norm": 1.4626226425170898, "learning_rate": 5.2600000000000005e-06, "loss": 0.4802, "step": 790 }, { "epoch": 0.16, "grad_norm": 1.9758862257003784, "learning_rate": 5.326666666666667e-06, "loss": 0.4725, "step": 800 }, { "epoch": 0.162, "grad_norm": 1.7073544263839722, "learning_rate": 5.393333333333334e-06, "loss": 0.412, "step": 810 }, { "epoch": 0.164, "grad_norm": 1.9127854108810425, "learning_rate": 5.460000000000001e-06, "loss": 0.4806, "step": 820 }, { "epoch": 0.166, "grad_norm": 1.452903389930725, "learning_rate": 5.5266666666666666e-06, "loss": 0.4171, "step": 830 }, { "epoch": 0.168, "grad_norm": 1.603830337524414, "learning_rate": 5.593333333333334e-06, "loss": 0.4348, "step": 840 }, { "epoch": 0.17, "grad_norm": 1.7848249673843384, "learning_rate": 5.66e-06, "loss": 0.4739, "step": 850 }, { "epoch": 0.172, "grad_norm": 1.8033798933029175, "learning_rate": 5.726666666666667e-06, "loss": 0.4251, "step": 860 }, { "epoch": 0.174, "grad_norm": 1.6332205533981323, "learning_rate": 5.793333333333334e-06, "loss": 0.4422, "step": 870 }, { "epoch": 0.176, "grad_norm": 1.8807281255722046, "learning_rate": 5.86e-06, "loss": 0.4224, "step": 880 }, { "epoch": 0.178, "grad_norm": 1.7026886940002441, "learning_rate": 5.926666666666667e-06, "loss": 0.4299, "step": 890 }, { "epoch": 0.18, "grad_norm": 1.69607412815094, "learning_rate": 5.993333333333334e-06, "loss": 0.5092, "step": 900 }, { "epoch": 0.182, "grad_norm": 1.3589431047439575, "learning_rate": 6.0600000000000004e-06, "loss": 0.399, "step": 910 }, { "epoch": 0.184, "grad_norm": 1.3969651460647583, "learning_rate": 6.126666666666668e-06, "loss": 0.5225, "step": 920 }, { "epoch": 0.186, "grad_norm": 1.8621569871902466, "learning_rate": 6.193333333333333e-06, "loss": 0.4389, "step": 930 }, { "epoch": 0.188, "grad_norm": 1.6221462488174438, "learning_rate": 6.26e-06, "loss": 0.4367, "step": 940 }, { "epoch": 0.19, "grad_norm": 1.452337622642517, "learning_rate": 6.326666666666667e-06, "loss": 0.452, "step": 950 }, { "epoch": 0.192, "grad_norm": 1.5459558963775635, "learning_rate": 6.393333333333334e-06, "loss": 0.5254, "step": 960 }, { "epoch": 0.194, "grad_norm": 1.353761076927185, "learning_rate": 6.460000000000001e-06, "loss": 0.4755, "step": 970 }, { "epoch": 0.196, "grad_norm": 1.3966904878616333, "learning_rate": 6.526666666666666e-06, "loss": 0.4842, "step": 980 }, { "epoch": 0.198, "grad_norm": 1.3571563959121704, "learning_rate": 6.5933333333333335e-06, "loss": 0.4776, "step": 990 }, { "epoch": 0.2, "grad_norm": 1.7380127906799316, "learning_rate": 6.660000000000001e-06, "loss": 0.5087, "step": 1000 }, { "epoch": 0.202, "grad_norm": 1.30695378780365, "learning_rate": 6.726666666666667e-06, "loss": 0.4261, "step": 1010 }, { "epoch": 0.204, "grad_norm": 1.4552329778671265, "learning_rate": 6.793333333333334e-06, "loss": 0.4485, "step": 1020 }, { "epoch": 0.206, "grad_norm": 1.4665312767028809, "learning_rate": 6.860000000000001e-06, "loss": 0.4326, "step": 1030 }, { "epoch": 0.208, "grad_norm": 1.511225938796997, "learning_rate": 6.926666666666667e-06, "loss": 0.4234, "step": 1040 }, { "epoch": 0.21, "grad_norm": 1.2507603168487549, "learning_rate": 6.993333333333334e-06, "loss": 0.3983, "step": 1050 }, { "epoch": 0.212, "grad_norm": 1.8667805194854736, "learning_rate": 7.06e-06, "loss": 0.4798, "step": 1060 }, { "epoch": 0.214, "grad_norm": 1.400815725326538, "learning_rate": 7.126666666666667e-06, "loss": 0.4947, "step": 1070 }, { "epoch": 0.216, "grad_norm": 1.2267810106277466, "learning_rate": 7.1933333333333345e-06, "loss": 0.4703, "step": 1080 }, { "epoch": 0.218, "grad_norm": 1.2562822103500366, "learning_rate": 7.260000000000001e-06, "loss": 0.4272, "step": 1090 }, { "epoch": 0.22, "grad_norm": 1.6255943775177002, "learning_rate": 7.326666666666667e-06, "loss": 0.5082, "step": 1100 }, { "epoch": 0.222, "grad_norm": 1.6352773904800415, "learning_rate": 7.393333333333333e-06, "loss": 0.4679, "step": 1110 }, { "epoch": 0.224, "grad_norm": 1.4105769395828247, "learning_rate": 7.4600000000000006e-06, "loss": 0.4716, "step": 1120 }, { "epoch": 0.226, "grad_norm": 1.766946792602539, "learning_rate": 7.526666666666668e-06, "loss": 0.5016, "step": 1130 }, { "epoch": 0.228, "grad_norm": 1.1533312797546387, "learning_rate": 7.593333333333334e-06, "loss": 0.4691, "step": 1140 }, { "epoch": 0.23, "grad_norm": 1.6040184497833252, "learning_rate": 7.660000000000001e-06, "loss": 0.4123, "step": 1150 }, { "epoch": 0.232, "grad_norm": 1.3271604776382446, "learning_rate": 7.726666666666667e-06, "loss": 0.4571, "step": 1160 }, { "epoch": 0.234, "grad_norm": 1.7813302278518677, "learning_rate": 7.793333333333334e-06, "loss": 0.4868, "step": 1170 }, { "epoch": 0.236, "grad_norm": 1.5846160650253296, "learning_rate": 7.860000000000001e-06, "loss": 0.4237, "step": 1180 }, { "epoch": 0.238, "grad_norm": 1.4138127565383911, "learning_rate": 7.926666666666666e-06, "loss": 0.4485, "step": 1190 }, { "epoch": 0.24, "grad_norm": 1.7159594297409058, "learning_rate": 7.993333333333334e-06, "loss": 0.5088, "step": 1200 }, { "epoch": 0.242, "grad_norm": 1.5566740036010742, "learning_rate": 8.06e-06, "loss": 0.4248, "step": 1210 }, { "epoch": 0.244, "grad_norm": 1.4056092500686646, "learning_rate": 8.126666666666668e-06, "loss": 0.4242, "step": 1220 }, { "epoch": 0.246, "grad_norm": 1.2954442501068115, "learning_rate": 8.193333333333335e-06, "loss": 0.479, "step": 1230 }, { "epoch": 0.248, "grad_norm": 1.576978325843811, "learning_rate": 8.26e-06, "loss": 0.4207, "step": 1240 }, { "epoch": 0.25, "grad_norm": 1.4118748903274536, "learning_rate": 8.326666666666668e-06, "loss": 0.4451, "step": 1250 }, { "epoch": 0.252, "grad_norm": 1.3957098722457886, "learning_rate": 8.393333333333335e-06, "loss": 0.4955, "step": 1260 }, { "epoch": 0.254, "grad_norm": 1.2895923852920532, "learning_rate": 8.46e-06, "loss": 0.4241, "step": 1270 }, { "epoch": 0.256, "grad_norm": 1.5065820217132568, "learning_rate": 8.526666666666667e-06, "loss": 0.4727, "step": 1280 }, { "epoch": 0.258, "grad_norm": 1.086529016494751, "learning_rate": 8.593333333333333e-06, "loss": 0.4429, "step": 1290 }, { "epoch": 0.26, "grad_norm": 1.2884867191314697, "learning_rate": 8.66e-06, "loss": 0.4496, "step": 1300 }, { "epoch": 0.262, "grad_norm": 1.344300627708435, "learning_rate": 8.726666666666667e-06, "loss": 0.3874, "step": 1310 }, { "epoch": 0.264, "grad_norm": 1.2245051860809326, "learning_rate": 8.793333333333334e-06, "loss": 0.5057, "step": 1320 }, { "epoch": 0.266, "grad_norm": 1.3924754858016968, "learning_rate": 8.860000000000002e-06, "loss": 0.4008, "step": 1330 }, { "epoch": 0.268, "grad_norm": 1.4217268228530884, "learning_rate": 8.926666666666669e-06, "loss": 0.4915, "step": 1340 }, { "epoch": 0.27, "grad_norm": 0.9328693747520447, "learning_rate": 8.993333333333334e-06, "loss": 0.4044, "step": 1350 }, { "epoch": 0.272, "grad_norm": 1.2598109245300293, "learning_rate": 9.060000000000001e-06, "loss": 0.4407, "step": 1360 }, { "epoch": 0.274, "grad_norm": 1.253463625907898, "learning_rate": 9.126666666666667e-06, "loss": 0.4671, "step": 1370 }, { "epoch": 0.276, "grad_norm": 1.41267728805542, "learning_rate": 9.193333333333334e-06, "loss": 0.5014, "step": 1380 }, { "epoch": 0.278, "grad_norm": 1.285569190979004, "learning_rate": 9.260000000000001e-06, "loss": 0.5425, "step": 1390 }, { "epoch": 0.28, "grad_norm": 1.9558253288269043, "learning_rate": 9.326666666666667e-06, "loss": 0.4851, "step": 1400 }, { "epoch": 0.282, "grad_norm": 1.1378921270370483, "learning_rate": 9.393333333333334e-06, "loss": 0.5159, "step": 1410 }, { "epoch": 0.284, "grad_norm": 1.152918815612793, "learning_rate": 9.460000000000001e-06, "loss": 0.4235, "step": 1420 }, { "epoch": 0.286, "grad_norm": 1.405306339263916, "learning_rate": 9.526666666666668e-06, "loss": 0.4881, "step": 1430 }, { "epoch": 0.288, "grad_norm": 1.4959778785705566, "learning_rate": 9.593333333333335e-06, "loss": 0.4729, "step": 1440 }, { "epoch": 0.29, "grad_norm": 1.3399648666381836, "learning_rate": 9.66e-06, "loss": 0.4554, "step": 1450 }, { "epoch": 0.292, "grad_norm": 1.1871027946472168, "learning_rate": 9.726666666666668e-06, "loss": 0.4591, "step": 1460 }, { "epoch": 0.294, "grad_norm": 1.196593165397644, "learning_rate": 9.793333333333333e-06, "loss": 0.5592, "step": 1470 }, { "epoch": 0.296, "grad_norm": 1.4280208349227905, "learning_rate": 9.86e-06, "loss": 0.4551, "step": 1480 }, { "epoch": 0.298, "grad_norm": 1.1892995834350586, "learning_rate": 9.926666666666668e-06, "loss": 0.4864, "step": 1490 }, { "epoch": 0.3, "grad_norm": 1.3978432416915894, "learning_rate": 9.993333333333333e-06, "loss": 0.5178, "step": 1500 }, { "epoch": 0.302, "grad_norm": 1.2265063524246216, "learning_rate": 9.999989033776898e-06, "loss": 0.446, "step": 1510 }, { "epoch": 0.304, "grad_norm": 1.7466825246810913, "learning_rate": 9.999951125906936e-06, "loss": 0.5599, "step": 1520 }, { "epoch": 0.306, "grad_norm": 1.2893444299697876, "learning_rate": 9.999886141209892e-06, "loss": 0.5088, "step": 1530 }, { "epoch": 0.308, "grad_norm": 1.459552526473999, "learning_rate": 9.999794080037675e-06, "loss": 0.4981, "step": 1540 }, { "epoch": 0.31, "grad_norm": 1.654597520828247, "learning_rate": 9.99967494288884e-06, "loss": 0.561, "step": 1550 }, { "epoch": 0.312, "grad_norm": 1.3994464874267578, "learning_rate": 9.999528730408565e-06, "loss": 0.4872, "step": 1560 }, { "epoch": 0.314, "grad_norm": 1.6109237670898438, "learning_rate": 9.999355443388649e-06, "loss": 0.4748, "step": 1570 }, { "epoch": 0.316, "grad_norm": 1.2756563425064087, "learning_rate": 9.999155082767515e-06, "loss": 0.4265, "step": 1580 }, { "epoch": 0.318, "grad_norm": 1.2647457122802734, "learning_rate": 9.998927649630202e-06, "loss": 0.4898, "step": 1590 }, { "epoch": 0.32, "grad_norm": 1.2949209213256836, "learning_rate": 9.998673145208351e-06, "loss": 0.4366, "step": 1600 }, { "epoch": 0.322, "grad_norm": 1.2482858896255493, "learning_rate": 9.998391570880212e-06, "loss": 0.5014, "step": 1610 }, { "epoch": 0.324, "grad_norm": 1.0791841745376587, "learning_rate": 9.99808292817063e-06, "loss": 0.4954, "step": 1620 }, { "epoch": 0.326, "grad_norm": 1.4692087173461914, "learning_rate": 9.997747218751032e-06, "loss": 0.5007, "step": 1630 }, { "epoch": 0.328, "grad_norm": 1.5948970317840576, "learning_rate": 9.997384444439424e-06, "loss": 0.4542, "step": 1640 }, { "epoch": 0.33, "grad_norm": 1.2097396850585938, "learning_rate": 9.996994607200382e-06, "loss": 0.4525, "step": 1650 }, { "epoch": 0.332, "grad_norm": 1.343247413635254, "learning_rate": 9.99657770914504e-06, "loss": 0.4434, "step": 1660 }, { "epoch": 0.334, "grad_norm": 1.5379422903060913, "learning_rate": 9.996133752531071e-06, "loss": 0.5859, "step": 1670 }, { "epoch": 0.336, "grad_norm": 1.158941388130188, "learning_rate": 9.99566273976269e-06, "loss": 0.495, "step": 1680 }, { "epoch": 0.338, "grad_norm": 1.1630929708480835, "learning_rate": 9.995164673390624e-06, "loss": 0.5463, "step": 1690 }, { "epoch": 0.34, "grad_norm": 1.256372094154358, "learning_rate": 9.994639556112113e-06, "loss": 0.4853, "step": 1700 }, { "epoch": 0.342, "grad_norm": 1.8223704099655151, "learning_rate": 9.994087390770887e-06, "loss": 0.4836, "step": 1710 }, { "epoch": 0.344, "grad_norm": 1.945887565612793, "learning_rate": 9.993508180357154e-06, "loss": 0.5115, "step": 1720 }, { "epoch": 0.346, "grad_norm": 1.6152077913284302, "learning_rate": 9.992901928007577e-06, "loss": 0.4939, "step": 1730 }, { "epoch": 0.348, "grad_norm": 1.2321640253067017, "learning_rate": 9.992268637005268e-06, "loss": 0.5277, "step": 1740 }, { "epoch": 0.35, "grad_norm": 1.2367174625396729, "learning_rate": 9.991608310779762e-06, "loss": 0.4691, "step": 1750 }, { "epoch": 0.352, "grad_norm": 1.744498372077942, "learning_rate": 9.990920952907005e-06, "loss": 0.497, "step": 1760 }, { "epoch": 0.354, "grad_norm": 1.5234036445617676, "learning_rate": 9.99020656710932e-06, "loss": 0.4469, "step": 1770 }, { "epoch": 0.356, "grad_norm": 1.0841994285583496, "learning_rate": 9.989465157255413e-06, "loss": 0.4317, "step": 1780 }, { "epoch": 0.358, "grad_norm": 1.1552493572235107, "learning_rate": 9.988696727360323e-06, "loss": 0.461, "step": 1790 }, { "epoch": 0.36, "grad_norm": 1.6440882682800293, "learning_rate": 9.987901281585423e-06, "loss": 0.5038, "step": 1800 }, { "epoch": 0.362, "grad_norm": 1.3549693822860718, "learning_rate": 9.987078824238384e-06, "loss": 0.468, "step": 1810 }, { "epoch": 0.364, "grad_norm": 1.3959366083145142, "learning_rate": 9.986229359773154e-06, "loss": 0.4709, "step": 1820 }, { "epoch": 0.366, "grad_norm": 1.612743854522705, "learning_rate": 9.985352892789941e-06, "loss": 0.4717, "step": 1830 }, { "epoch": 0.368, "grad_norm": 1.2929461002349854, "learning_rate": 9.98444942803518e-06, "loss": 0.4811, "step": 1840 }, { "epoch": 0.37, "grad_norm": 1.450260877609253, "learning_rate": 9.983518970401508e-06, "loss": 0.5203, "step": 1850 }, { "epoch": 0.372, "grad_norm": 1.1035138368606567, "learning_rate": 9.982561524927749e-06, "loss": 0.4759, "step": 1860 }, { "epoch": 0.374, "grad_norm": 1.2472127676010132, "learning_rate": 9.981577096798864e-06, "loss": 0.5506, "step": 1870 }, { "epoch": 0.376, "grad_norm": 1.1840726137161255, "learning_rate": 9.980565691345945e-06, "loss": 0.4878, "step": 1880 }, { "epoch": 0.378, "grad_norm": 1.3959659337997437, "learning_rate": 9.979527314046177e-06, "loss": 0.4916, "step": 1890 }, { "epoch": 0.38, "grad_norm": 1.154722809791565, "learning_rate": 9.978461970522807e-06, "loss": 0.4592, "step": 1900 }, { "epoch": 0.382, "grad_norm": 1.2845226526260376, "learning_rate": 9.977369666545114e-06, "loss": 0.5465, "step": 1910 }, { "epoch": 0.384, "grad_norm": 1.175721287727356, "learning_rate": 9.976250408028383e-06, "loss": 0.4742, "step": 1920 }, { "epoch": 0.386, "grad_norm": 1.5375049114227295, "learning_rate": 9.975104201033868e-06, "loss": 0.4783, "step": 1930 }, { "epoch": 0.388, "grad_norm": 1.305464506149292, "learning_rate": 9.973931051768756e-06, "loss": 0.4931, "step": 1940 }, { "epoch": 0.39, "grad_norm": 1.2971112728118896, "learning_rate": 9.972730966586144e-06, "loss": 0.4617, "step": 1950 }, { "epoch": 0.392, "grad_norm": 1.2412433624267578, "learning_rate": 9.971503951984996e-06, "loss": 0.5366, "step": 1960 }, { "epoch": 0.394, "grad_norm": 1.3253568410873413, "learning_rate": 9.970250014610105e-06, "loss": 0.5186, "step": 1970 }, { "epoch": 0.396, "grad_norm": 1.2338709831237793, "learning_rate": 9.968969161252072e-06, "loss": 0.5074, "step": 1980 }, { "epoch": 0.398, "grad_norm": 1.0964592695236206, "learning_rate": 9.96766139884725e-06, "loss": 0.4096, "step": 1990 }, { "epoch": 0.4, "grad_norm": 1.8194420337677002, "learning_rate": 9.966326734477721e-06, "loss": 0.4439, "step": 2000 }, { "epoch": 0.402, "grad_norm": 1.0826818943023682, "learning_rate": 9.96496517537125e-06, "loss": 0.4602, "step": 2010 }, { "epoch": 0.404, "grad_norm": 0.9985308647155762, "learning_rate": 9.96357672890125e-06, "loss": 0.429, "step": 2020 }, { "epoch": 0.406, "grad_norm": 1.4715704917907715, "learning_rate": 9.96216140258674e-06, "loss": 0.5181, "step": 2030 }, { "epoch": 0.408, "grad_norm": 1.606497049331665, "learning_rate": 9.9607192040923e-06, "loss": 0.4314, "step": 2040 }, { "epoch": 0.41, "grad_norm": 1.4677903652191162, "learning_rate": 9.959250141228046e-06, "loss": 0.4204, "step": 2050 }, { "epoch": 0.412, "grad_norm": 1.4426929950714111, "learning_rate": 9.95775422194956e-06, "loss": 0.4803, "step": 2060 }, { "epoch": 0.414, "grad_norm": 1.2194904088974, "learning_rate": 9.956231454357876e-06, "loss": 0.4695, "step": 2070 }, { "epoch": 0.416, "grad_norm": 1.1238853931427002, "learning_rate": 9.954681846699414e-06, "loss": 0.5755, "step": 2080 }, { "epoch": 0.418, "grad_norm": 1.0031379461288452, "learning_rate": 9.953105407365952e-06, "loss": 0.4325, "step": 2090 }, { "epoch": 0.42, "grad_norm": 1.6501160860061646, "learning_rate": 9.951502144894566e-06, "loss": 0.5403, "step": 2100 }, { "epoch": 0.422, "grad_norm": 1.440741777420044, "learning_rate": 9.94987206796759e-06, "loss": 0.4725, "step": 2110 }, { "epoch": 0.424, "grad_norm": 1.2382850646972656, "learning_rate": 9.948215185412578e-06, "loss": 0.4235, "step": 2120 }, { "epoch": 0.426, "grad_norm": 1.3473109006881714, "learning_rate": 9.94653150620224e-06, "loss": 0.4388, "step": 2130 }, { "epoch": 0.428, "grad_norm": 1.2151130437850952, "learning_rate": 9.944821039454403e-06, "loss": 0.5012, "step": 2140 }, { "epoch": 0.43, "grad_norm": 1.1826070547103882, "learning_rate": 9.943083794431959e-06, "loss": 0.486, "step": 2150 }, { "epoch": 0.432, "grad_norm": 1.0905817747116089, "learning_rate": 9.941319780542817e-06, "loss": 0.5423, "step": 2160 }, { "epoch": 0.434, "grad_norm": 1.3496341705322266, "learning_rate": 9.939529007339852e-06, "loss": 0.5141, "step": 2170 }, { "epoch": 0.436, "grad_norm": 1.33112633228302, "learning_rate": 9.937711484520848e-06, "loss": 0.4819, "step": 2180 }, { "epoch": 0.438, "grad_norm": 1.1748610734939575, "learning_rate": 9.935867221928454e-06, "loss": 0.5097, "step": 2190 }, { "epoch": 0.44, "grad_norm": 1.2363240718841553, "learning_rate": 9.93399622955012e-06, "loss": 0.4261, "step": 2200 }, { "epoch": 0.442, "grad_norm": 1.667703628540039, "learning_rate": 9.932098517518056e-06, "loss": 0.4662, "step": 2210 }, { "epoch": 0.444, "grad_norm": 1.446291446685791, "learning_rate": 9.930174096109163e-06, "loss": 0.445, "step": 2220 }, { "epoch": 0.446, "grad_norm": 1.476731300354004, "learning_rate": 9.928222975744992e-06, "loss": 0.4666, "step": 2230 }, { "epoch": 0.448, "grad_norm": 1.3054319620132446, "learning_rate": 9.926245166991671e-06, "loss": 0.4248, "step": 2240 }, { "epoch": 0.45, "grad_norm": 1.5817550420761108, "learning_rate": 9.924240680559867e-06, "loss": 0.4325, "step": 2250 }, { "epoch": 0.452, "grad_norm": 1.0748894214630127, "learning_rate": 9.922209527304709e-06, "loss": 0.4862, "step": 2260 }, { "epoch": 0.454, "grad_norm": 1.6298894882202148, "learning_rate": 9.920151718225743e-06, "loss": 0.542, "step": 2270 }, { "epoch": 0.456, "grad_norm": 1.4869043827056885, "learning_rate": 9.918067264466867e-06, "loss": 0.4627, "step": 2280 }, { "epoch": 0.458, "grad_norm": 1.3258094787597656, "learning_rate": 9.915956177316269e-06, "loss": 0.4389, "step": 2290 }, { "epoch": 0.46, "grad_norm": 1.586165189743042, "learning_rate": 9.913818468206368e-06, "loss": 0.6127, "step": 2300 }, { "epoch": 0.462, "grad_norm": 1.3697926998138428, "learning_rate": 9.911654148713757e-06, "loss": 0.574, "step": 2310 }, { "epoch": 0.464, "grad_norm": 1.212844729423523, "learning_rate": 9.909463230559127e-06, "loss": 0.5196, "step": 2320 }, { "epoch": 0.466, "grad_norm": 0.9597765207290649, "learning_rate": 9.907245725607217e-06, "loss": 0.5269, "step": 2330 }, { "epoch": 0.468, "grad_norm": 1.3319644927978516, "learning_rate": 9.905001645866746e-06, "loss": 0.4136, "step": 2340 }, { "epoch": 0.47, "grad_norm": 1.0987292528152466, "learning_rate": 9.902731003490344e-06, "loss": 0.4325, "step": 2350 }, { "epoch": 0.472, "grad_norm": 1.3473751544952393, "learning_rate": 9.90043381077449e-06, "loss": 0.464, "step": 2360 }, { "epoch": 0.474, "grad_norm": 1.5998402833938599, "learning_rate": 9.898110080159442e-06, "loss": 0.6172, "step": 2370 }, { "epoch": 0.476, "grad_norm": 0.9513059258460999, "learning_rate": 9.895759824229176e-06, "loss": 0.4431, "step": 2380 }, { "epoch": 0.478, "grad_norm": 1.3348459005355835, "learning_rate": 9.893383055711308e-06, "loss": 0.4383, "step": 2390 }, { "epoch": 0.48, "grad_norm": 1.3876981735229492, "learning_rate": 9.890979787477036e-06, "loss": 0.5066, "step": 2400 }, { "epoch": 0.482, "grad_norm": 1.3132597208023071, "learning_rate": 9.88855003254106e-06, "loss": 0.4995, "step": 2410 }, { "epoch": 0.484, "grad_norm": 1.3520206212997437, "learning_rate": 9.886093804061523e-06, "loss": 0.4658, "step": 2420 }, { "epoch": 0.486, "grad_norm": 1.4986363649368286, "learning_rate": 9.883611115339929e-06, "loss": 0.4718, "step": 2430 }, { "epoch": 0.488, "grad_norm": 1.3672336339950562, "learning_rate": 9.881101979821075e-06, "loss": 0.5289, "step": 2440 }, { "epoch": 0.49, "grad_norm": 1.610236406326294, "learning_rate": 9.87856641109298e-06, "loss": 0.5065, "step": 2450 }, { "epoch": 0.492, "grad_norm": 1.241811752319336, "learning_rate": 9.876004422886809e-06, "loss": 0.3937, "step": 2460 }, { "epoch": 0.494, "grad_norm": 1.385197639465332, "learning_rate": 9.873416029076801e-06, "loss": 0.5553, "step": 2470 }, { "epoch": 0.496, "grad_norm": 1.2933454513549805, "learning_rate": 9.870801243680191e-06, "loss": 0.4653, "step": 2480 }, { "epoch": 0.498, "grad_norm": 1.2639875411987305, "learning_rate": 9.868160080857134e-06, "loss": 0.4908, "step": 2490 }, { "epoch": 0.5, "grad_norm": 1.142906665802002, "learning_rate": 9.865492554910634e-06, "loss": 0.4884, "step": 2500 }, { "epoch": 0.502, "grad_norm": 1.0850920677185059, "learning_rate": 9.862798680286459e-06, "loss": 0.5224, "step": 2510 }, { "epoch": 0.504, "grad_norm": 1.3904608488082886, "learning_rate": 9.860078471573066e-06, "loss": 0.5273, "step": 2520 }, { "epoch": 0.506, "grad_norm": 1.1105901002883911, "learning_rate": 9.857331943501527e-06, "loss": 0.4478, "step": 2530 }, { "epoch": 0.508, "grad_norm": 1.278568983078003, "learning_rate": 9.854559110945436e-06, "loss": 0.5204, "step": 2540 }, { "epoch": 0.51, "grad_norm": 0.9616843461990356, "learning_rate": 9.851759988920843e-06, "loss": 0.452, "step": 2550 }, { "epoch": 0.512, "grad_norm": 1.7013401985168457, "learning_rate": 9.848934592586165e-06, "loss": 0.5092, "step": 2560 }, { "epoch": 0.514, "grad_norm": 1.2884467840194702, "learning_rate": 9.846082937242108e-06, "loss": 0.4333, "step": 2570 }, { "epoch": 0.516, "grad_norm": 1.0977705717086792, "learning_rate": 9.843205038331574e-06, "loss": 0.4188, "step": 2580 }, { "epoch": 0.518, "grad_norm": 1.1589962244033813, "learning_rate": 9.84030091143959e-06, "loss": 0.5456, "step": 2590 }, { "epoch": 0.52, "grad_norm": 1.4452130794525146, "learning_rate": 9.837370572293221e-06, "loss": 0.538, "step": 2600 }, { "epoch": 0.522, "grad_norm": 0.8810553550720215, "learning_rate": 9.834414036761477e-06, "loss": 0.4697, "step": 2610 }, { "epoch": 0.524, "grad_norm": 1.2297122478485107, "learning_rate": 9.831431320855235e-06, "loss": 0.4525, "step": 2620 }, { "epoch": 0.526, "grad_norm": 1.4245611429214478, "learning_rate": 9.828422440727152e-06, "loss": 0.5304, "step": 2630 }, { "epoch": 0.528, "grad_norm": 1.4869318008422852, "learning_rate": 9.82538741267157e-06, "loss": 0.5631, "step": 2640 }, { "epoch": 0.53, "grad_norm": 1.0916733741760254, "learning_rate": 9.822326253124436e-06, "loss": 0.4249, "step": 2650 }, { "epoch": 0.532, "grad_norm": 1.3281223773956299, "learning_rate": 9.819238978663212e-06, "loss": 0.4827, "step": 2660 }, { "epoch": 0.534, "grad_norm": 1.1685035228729248, "learning_rate": 9.816125606006777e-06, "loss": 0.547, "step": 2670 }, { "epoch": 0.536, "grad_norm": 1.309923768043518, "learning_rate": 9.812986152015349e-06, "loss": 0.4927, "step": 2680 }, { "epoch": 0.538, "grad_norm": 1.3431212902069092, "learning_rate": 9.809820633690383e-06, "loss": 0.5215, "step": 2690 }, { "epoch": 0.54, "grad_norm": 1.6138921976089478, "learning_rate": 9.806629068174486e-06, "loss": 0.4668, "step": 2700 }, { "epoch": 0.542, "grad_norm": 1.1583821773529053, "learning_rate": 9.803411472751321e-06, "loss": 0.4882, "step": 2710 }, { "epoch": 0.544, "grad_norm": 1.1183314323425293, "learning_rate": 9.800167864845513e-06, "loss": 0.5254, "step": 2720 }, { "epoch": 0.546, "grad_norm": 1.4351534843444824, "learning_rate": 9.796898262022555e-06, "loss": 0.5631, "step": 2730 }, { "epoch": 0.548, "grad_norm": 1.114702582359314, "learning_rate": 9.793602681988714e-06, "loss": 0.5087, "step": 2740 }, { "epoch": 0.55, "grad_norm": 1.6921616792678833, "learning_rate": 9.790281142590937e-06, "loss": 0.5482, "step": 2750 }, { "epoch": 0.552, "grad_norm": 1.3350164890289307, "learning_rate": 9.786933661816747e-06, "loss": 0.5148, "step": 2760 }, { "epoch": 0.554, "grad_norm": 1.3733949661254883, "learning_rate": 9.783560257794153e-06, "loss": 0.4336, "step": 2770 }, { "epoch": 0.556, "grad_norm": 1.232109546661377, "learning_rate": 9.78016094879155e-06, "loss": 0.4304, "step": 2780 }, { "epoch": 0.558, "grad_norm": 1.4043456315994263, "learning_rate": 9.776735753217618e-06, "loss": 0.4891, "step": 2790 }, { "epoch": 0.56, "grad_norm": 1.1592031717300415, "learning_rate": 9.773284689621223e-06, "loss": 0.4892, "step": 2800 }, { "epoch": 0.562, "grad_norm": 1.1605182886123657, "learning_rate": 9.76980777669132e-06, "loss": 0.4433, "step": 2810 }, { "epoch": 0.564, "grad_norm": 1.1218903064727783, "learning_rate": 9.766305033256847e-06, "loss": 0.5072, "step": 2820 }, { "epoch": 0.566, "grad_norm": 1.4614591598510742, "learning_rate": 9.762776478286622e-06, "loss": 0.4988, "step": 2830 }, { "epoch": 0.568, "grad_norm": 1.0388537645339966, "learning_rate": 9.75922213088925e-06, "loss": 0.4314, "step": 2840 }, { "epoch": 0.57, "grad_norm": 1.3844202756881714, "learning_rate": 9.755642010313007e-06, "loss": 0.4516, "step": 2850 }, { "epoch": 0.572, "grad_norm": 1.5695295333862305, "learning_rate": 9.752036135945743e-06, "loss": 0.4761, "step": 2860 }, { "epoch": 0.574, "grad_norm": 1.3042584657669067, "learning_rate": 9.748404527314782e-06, "loss": 0.4851, "step": 2870 }, { "epoch": 0.576, "grad_norm": 1.3485811948776245, "learning_rate": 9.744747204086795e-06, "loss": 0.4127, "step": 2880 }, { "epoch": 0.578, "grad_norm": 1.074900507926941, "learning_rate": 9.741064186067723e-06, "loss": 0.4803, "step": 2890 }, { "epoch": 0.58, "grad_norm": 1.3059568405151367, "learning_rate": 9.73735549320265e-06, "loss": 0.4279, "step": 2900 }, { "epoch": 0.582, "grad_norm": 1.1361761093139648, "learning_rate": 9.733621145575697e-06, "loss": 0.5364, "step": 2910 }, { "epoch": 0.584, "grad_norm": 1.4308760166168213, "learning_rate": 9.72986116340992e-06, "loss": 0.5978, "step": 2920 }, { "epoch": 0.586, "grad_norm": 0.9736759066581726, "learning_rate": 9.726075567067193e-06, "loss": 0.4471, "step": 2930 }, { "epoch": 0.588, "grad_norm": 1.555156946182251, "learning_rate": 9.722264377048105e-06, "loss": 0.526, "step": 2940 }, { "epoch": 0.59, "grad_norm": 1.1218699216842651, "learning_rate": 9.718427613991848e-06, "loss": 0.4025, "step": 2950 }, { "epoch": 0.592, "grad_norm": 1.1059517860412598, "learning_rate": 9.714565298676093e-06, "loss": 0.4503, "step": 2960 }, { "epoch": 0.594, "grad_norm": 1.3590087890625, "learning_rate": 9.710677452016898e-06, "loss": 0.5018, "step": 2970 }, { "epoch": 0.596, "grad_norm": 0.9605317115783691, "learning_rate": 9.706764095068579e-06, "loss": 0.474, "step": 2980 }, { "epoch": 0.598, "grad_norm": 0.9634230136871338, "learning_rate": 9.702825249023597e-06, "loss": 0.4051, "step": 2990 }, { "epoch": 0.6, "grad_norm": 1.0012608766555786, "learning_rate": 9.698860935212455e-06, "loss": 0.5104, "step": 3000 }, { "epoch": 0.602, "grad_norm": 1.254647970199585, "learning_rate": 9.69487117510357e-06, "loss": 0.4357, "step": 3010 }, { "epoch": 0.604, "grad_norm": 1.2579063177108765, "learning_rate": 9.69085599030316e-06, "loss": 0.4413, "step": 3020 }, { "epoch": 0.606, "grad_norm": 1.1054672002792358, "learning_rate": 9.686815402555133e-06, "loss": 0.6412, "step": 3030 }, { "epoch": 0.608, "grad_norm": 1.3997899293899536, "learning_rate": 9.682749433740963e-06, "loss": 0.4615, "step": 3040 }, { "epoch": 0.61, "grad_norm": 1.025417685508728, "learning_rate": 9.678658105879568e-06, "loss": 0.4214, "step": 3050 }, { "epoch": 0.612, "grad_norm": 1.0472135543823242, "learning_rate": 9.674541441127202e-06, "loss": 0.5116, "step": 3060 }, { "epoch": 0.614, "grad_norm": 1.078364372253418, "learning_rate": 9.670399461777328e-06, "loss": 0.4569, "step": 3070 }, { "epoch": 0.616, "grad_norm": 1.3852829933166504, "learning_rate": 9.666232190260496e-06, "loss": 0.53, "step": 3080 }, { "epoch": 0.618, "grad_norm": 1.3731964826583862, "learning_rate": 9.662039649144224e-06, "loss": 0.4696, "step": 3090 }, { "epoch": 0.62, "grad_norm": 1.291717290878296, "learning_rate": 9.65782186113288e-06, "loss": 0.4263, "step": 3100 }, { "epoch": 0.622, "grad_norm": 1.3405389785766602, "learning_rate": 9.653578849067542e-06, "loss": 0.4435, "step": 3110 }, { "epoch": 0.624, "grad_norm": 1.226496696472168, "learning_rate": 9.649310635925904e-06, "loss": 0.4492, "step": 3120 }, { "epoch": 0.626, "grad_norm": 1.15779447555542, "learning_rate": 9.645017244822124e-06, "loss": 0.5387, "step": 3130 }, { "epoch": 0.628, "grad_norm": 1.0545017719268799, "learning_rate": 9.640698699006708e-06, "loss": 0.4616, "step": 3140 }, { "epoch": 0.63, "grad_norm": 1.0854800939559937, "learning_rate": 9.636355021866388e-06, "loss": 0.518, "step": 3150 }, { "epoch": 0.632, "grad_norm": 1.292471170425415, "learning_rate": 9.631986236923998e-06, "loss": 0.4888, "step": 3160 }, { "epoch": 0.634, "grad_norm": 1.2289882898330688, "learning_rate": 9.62759236783833e-06, "loss": 0.5193, "step": 3170 }, { "epoch": 0.636, "grad_norm": 1.4421725273132324, "learning_rate": 9.623173438404027e-06, "loss": 0.5314, "step": 3180 }, { "epoch": 0.638, "grad_norm": 1.4118566513061523, "learning_rate": 9.61872947255144e-06, "loss": 0.5525, "step": 3190 }, { "epoch": 0.64, "grad_norm": 1.1490426063537598, "learning_rate": 9.614260494346505e-06, "loss": 0.4832, "step": 3200 }, { "epoch": 0.642, "grad_norm": 1.3505995273590088, "learning_rate": 9.609766527990604e-06, "loss": 0.4742, "step": 3210 }, { "epoch": 0.644, "grad_norm": 1.4278666973114014, "learning_rate": 9.605247597820448e-06, "loss": 0.458, "step": 3220 }, { "epoch": 0.646, "grad_norm": 1.5824147462844849, "learning_rate": 9.600703728307935e-06, "loss": 0.4327, "step": 3230 }, { "epoch": 0.648, "grad_norm": 1.1965423822402954, "learning_rate": 9.596134944060018e-06, "loss": 0.4241, "step": 3240 }, { "epoch": 0.65, "grad_norm": 1.3452823162078857, "learning_rate": 9.591541269818574e-06, "loss": 0.4312, "step": 3250 }, { "epoch": 0.652, "grad_norm": 1.6801984310150146, "learning_rate": 9.586922730460273e-06, "loss": 0.4858, "step": 3260 }, { "epoch": 0.654, "grad_norm": 1.521422266960144, "learning_rate": 9.582279350996437e-06, "loss": 0.4886, "step": 3270 }, { "epoch": 0.656, "grad_norm": 1.0848934650421143, "learning_rate": 9.577611156572908e-06, "loss": 0.5233, "step": 3280 }, { "epoch": 0.658, "grad_norm": 1.131194829940796, "learning_rate": 9.572918172469912e-06, "loss": 0.5093, "step": 3290 }, { "epoch": 0.66, "grad_norm": 0.7532334923744202, "learning_rate": 9.568200424101918e-06, "loss": 0.4439, "step": 3300 }, { "epoch": 0.662, "grad_norm": 0.992387056350708, "learning_rate": 9.563457937017514e-06, "loss": 0.4405, "step": 3310 }, { "epoch": 0.664, "grad_norm": 1.5561617612838745, "learning_rate": 9.558690736899248e-06, "loss": 0.5558, "step": 3320 }, { "epoch": 0.666, "grad_norm": 0.9538534283638, "learning_rate": 9.553898849563502e-06, "loss": 0.4767, "step": 3330 }, { "epoch": 0.668, "grad_norm": 1.5033385753631592, "learning_rate": 9.549082300960351e-06, "loss": 0.4721, "step": 3340 }, { "epoch": 0.67, "grad_norm": 1.4979592561721802, "learning_rate": 9.544241117173422e-06, "loss": 0.4778, "step": 3350 }, { "epoch": 0.672, "grad_norm": 1.2602815628051758, "learning_rate": 9.539375324419748e-06, "loss": 0.4715, "step": 3360 }, { "epoch": 0.674, "grad_norm": 1.1525267362594604, "learning_rate": 9.534484949049636e-06, "loss": 0.5271, "step": 3370 }, { "epoch": 0.676, "grad_norm": 1.18638014793396, "learning_rate": 9.529570017546512e-06, "loss": 0.5016, "step": 3380 }, { "epoch": 0.678, "grad_norm": 1.3862594366073608, "learning_rate": 9.524630556526788e-06, "loss": 0.4872, "step": 3390 }, { "epoch": 0.68, "grad_norm": 1.5191760063171387, "learning_rate": 9.51966659273971e-06, "loss": 0.4688, "step": 3400 }, { "epoch": 0.682, "grad_norm": 1.176364541053772, "learning_rate": 9.514678153067218e-06, "loss": 0.4674, "step": 3410 }, { "epoch": 0.684, "grad_norm": 1.2833201885223389, "learning_rate": 9.509665264523803e-06, "loss": 0.4414, "step": 3420 }, { "epoch": 0.686, "grad_norm": 0.9362300634384155, "learning_rate": 9.504627954256352e-06, "loss": 0.368, "step": 3430 }, { "epoch": 0.688, "grad_norm": 1.2998939752578735, "learning_rate": 9.499566249544006e-06, "loss": 0.4505, "step": 3440 }, { "epoch": 0.69, "grad_norm": 0.7536768913269043, "learning_rate": 9.494480177798013e-06, "loss": 0.5055, "step": 3450 }, { "epoch": 0.692, "grad_norm": 1.1865419149398804, "learning_rate": 9.489369766561584e-06, "loss": 0.4671, "step": 3460 }, { "epoch": 0.694, "grad_norm": 1.128171443939209, "learning_rate": 9.48423504350973e-06, "loss": 0.4721, "step": 3470 }, { "epoch": 0.696, "grad_norm": 1.1129778623580933, "learning_rate": 9.479076036449125e-06, "loss": 0.5855, "step": 3480 }, { "epoch": 0.698, "grad_norm": 1.0635051727294922, "learning_rate": 9.473892773317952e-06, "loss": 0.4509, "step": 3490 }, { "epoch": 0.7, "grad_norm": 0.9211987257003784, "learning_rate": 9.468685282185745e-06, "loss": 0.4543, "step": 3500 }, { "epoch": 0.702, "grad_norm": 1.2037076950073242, "learning_rate": 9.463453591253253e-06, "loss": 0.4839, "step": 3510 }, { "epoch": 0.704, "grad_norm": 0.9373881220817566, "learning_rate": 9.458197728852268e-06, "loss": 0.4215, "step": 3520 }, { "epoch": 0.706, "grad_norm": 0.9381740093231201, "learning_rate": 9.452917723445484e-06, "loss": 0.4598, "step": 3530 }, { "epoch": 0.708, "grad_norm": 1.6144822835922241, "learning_rate": 9.447613603626337e-06, "loss": 0.5762, "step": 3540 }, { "epoch": 0.71, "grad_norm": 1.3809951543807983, "learning_rate": 9.44228539811886e-06, "loss": 0.4934, "step": 3550 }, { "epoch": 0.712, "grad_norm": 1.1491771936416626, "learning_rate": 9.436933135777511e-06, "loss": 0.4639, "step": 3560 }, { "epoch": 0.714, "grad_norm": 1.0632610321044922, "learning_rate": 9.431556845587029e-06, "loss": 0.4966, "step": 3570 }, { "epoch": 0.716, "grad_norm": 1.2312216758728027, "learning_rate": 9.426156556662276e-06, "loss": 0.4287, "step": 3580 }, { "epoch": 0.718, "grad_norm": 1.2142528295516968, "learning_rate": 9.420732298248077e-06, "loss": 0.4216, "step": 3590 }, { "epoch": 0.72, "grad_norm": 0.7389311194419861, "learning_rate": 9.41528409971906e-06, "loss": 0.4317, "step": 3600 }, { "epoch": 0.722, "grad_norm": 1.421856164932251, "learning_rate": 9.409811990579498e-06, "loss": 0.4798, "step": 3610 }, { "epoch": 0.724, "grad_norm": 1.5812768936157227, "learning_rate": 9.404316000463152e-06, "loss": 0.5848, "step": 3620 }, { "epoch": 0.726, "grad_norm": 0.9333869218826294, "learning_rate": 9.398796159133108e-06, "loss": 0.4232, "step": 3630 }, { "epoch": 0.728, "grad_norm": 1.2076997756958008, "learning_rate": 9.393252496481615e-06, "loss": 0.5026, "step": 3640 }, { "epoch": 0.73, "grad_norm": 1.4881305694580078, "learning_rate": 9.387685042529926e-06, "loss": 0.525, "step": 3650 }, { "epoch": 0.732, "grad_norm": 1.2889773845672607, "learning_rate": 9.382093827428135e-06, "loss": 0.4729, "step": 3660 }, { "epoch": 0.734, "grad_norm": 1.4050449132919312, "learning_rate": 9.376478881455008e-06, "loss": 0.4944, "step": 3670 }, { "epoch": 0.736, "grad_norm": 0.8713739514350891, "learning_rate": 9.370840235017829e-06, "loss": 0.4217, "step": 3680 }, { "epoch": 0.738, "grad_norm": 1.031435251235962, "learning_rate": 9.365177918652226e-06, "loss": 0.4611, "step": 3690 }, { "epoch": 0.74, "grad_norm": 0.8381909132003784, "learning_rate": 9.35949196302201e-06, "loss": 0.4212, "step": 3700 }, { "epoch": 0.742, "grad_norm": 1.1888693571090698, "learning_rate": 9.353782398919012e-06, "loss": 0.4543, "step": 3710 }, { "epoch": 0.744, "grad_norm": 1.4899085760116577, "learning_rate": 9.348049257262908e-06, "loss": 0.5147, "step": 3720 }, { "epoch": 0.746, "grad_norm": 1.2330344915390015, "learning_rate": 9.342292569101061e-06, "loss": 0.6145, "step": 3730 }, { "epoch": 0.748, "grad_norm": 1.0226585865020752, "learning_rate": 9.336512365608343e-06, "loss": 0.4879, "step": 3740 }, { "epoch": 0.75, "grad_norm": 1.1054939031600952, "learning_rate": 9.330708678086975e-06, "loss": 0.4206, "step": 3750 }, { "epoch": 0.752, "grad_norm": 1.4774991273880005, "learning_rate": 9.324881537966355e-06, "loss": 0.4093, "step": 3760 }, { "epoch": 0.754, "grad_norm": 1.1856415271759033, "learning_rate": 9.319030976802881e-06, "loss": 0.4614, "step": 3770 }, { "epoch": 0.756, "grad_norm": 1.3986643552780151, "learning_rate": 9.313157026279792e-06, "loss": 0.5174, "step": 3780 }, { "epoch": 0.758, "grad_norm": 1.2036128044128418, "learning_rate": 9.307259718206984e-06, "loss": 0.464, "step": 3790 }, { "epoch": 0.76, "grad_norm": 0.8177152276039124, "learning_rate": 9.301339084520853e-06, "loss": 0.4331, "step": 3800 }, { "epoch": 0.762, "grad_norm": 1.1826705932617188, "learning_rate": 9.295395157284103e-06, "loss": 0.4363, "step": 3810 }, { "epoch": 0.764, "grad_norm": 1.0908697843551636, "learning_rate": 9.289427968685588e-06, "loss": 0.4087, "step": 3820 }, { "epoch": 0.766, "grad_norm": 0.9723864793777466, "learning_rate": 9.28343755104013e-06, "loss": 0.3952, "step": 3830 }, { "epoch": 0.768, "grad_norm": 1.1800845861434937, "learning_rate": 9.277423936788348e-06, "loss": 0.4966, "step": 3840 }, { "epoch": 0.77, "grad_norm": 1.5162053108215332, "learning_rate": 9.271387158496477e-06, "loss": 0.5525, "step": 3850 }, { "epoch": 0.772, "grad_norm": 1.1737457513809204, "learning_rate": 9.265327248856198e-06, "loss": 0.4374, "step": 3860 }, { "epoch": 0.774, "grad_norm": 1.1317731142044067, "learning_rate": 9.259244240684457e-06, "loss": 0.4321, "step": 3870 }, { "epoch": 0.776, "grad_norm": 1.3863508701324463, "learning_rate": 9.25313816692329e-06, "loss": 0.5441, "step": 3880 }, { "epoch": 0.778, "grad_norm": 1.0942466259002686, "learning_rate": 9.247009060639637e-06, "loss": 0.5124, "step": 3890 }, { "epoch": 0.78, "grad_norm": 1.3743023872375488, "learning_rate": 9.240856955025175e-06, "loss": 0.4357, "step": 3900 }, { "epoch": 0.782, "grad_norm": 1.1339491605758667, "learning_rate": 9.234681883396129e-06, "loss": 0.5066, "step": 3910 }, { "epoch": 0.784, "grad_norm": 1.9541819095611572, "learning_rate": 9.228483879193096e-06, "loss": 0.4864, "step": 3920 }, { "epoch": 0.786, "grad_norm": 1.257652759552002, "learning_rate": 9.22226297598086e-06, "loss": 0.4361, "step": 3930 }, { "epoch": 0.788, "grad_norm": 1.4581412076950073, "learning_rate": 9.216019207448216e-06, "loss": 0.5005, "step": 3940 }, { "epoch": 0.79, "grad_norm": 1.6676081418991089, "learning_rate": 9.209752607407784e-06, "loss": 0.4307, "step": 3950 }, { "epoch": 0.792, "grad_norm": 0.898801863193512, "learning_rate": 9.203463209795822e-06, "loss": 0.487, "step": 3960 }, { "epoch": 0.794, "grad_norm": 1.6811095476150513, "learning_rate": 9.197151048672051e-06, "loss": 0.5448, "step": 3970 }, { "epoch": 0.796, "grad_norm": 1.222164511680603, "learning_rate": 9.190816158219462e-06, "loss": 0.4603, "step": 3980 }, { "epoch": 0.798, "grad_norm": 1.161139726638794, "learning_rate": 9.18445857274414e-06, "loss": 0.5216, "step": 3990 }, { "epoch": 0.8, "grad_norm": 1.3466241359710693, "learning_rate": 9.178078326675069e-06, "loss": 0.5273, "step": 4000 }, { "epoch": 0.802, "grad_norm": 1.1372344493865967, "learning_rate": 9.171675454563949e-06, "loss": 0.4842, "step": 4010 }, { "epoch": 0.804, "grad_norm": 1.10002601146698, "learning_rate": 9.165249991085012e-06, "loss": 0.5429, "step": 4020 }, { "epoch": 0.806, "grad_norm": 0.8916090726852417, "learning_rate": 9.158801971034832e-06, "loss": 0.5703, "step": 4030 }, { "epoch": 0.808, "grad_norm": 1.131230115890503, "learning_rate": 9.152331429332136e-06, "loss": 0.4555, "step": 4040 }, { "epoch": 0.81, "grad_norm": 0.9842562079429626, "learning_rate": 9.14583840101761e-06, "loss": 0.4475, "step": 4050 }, { "epoch": 0.812, "grad_norm": 1.0680824518203735, "learning_rate": 9.139322921253724e-06, "loss": 0.4471, "step": 4060 }, { "epoch": 0.814, "grad_norm": 1.2112277746200562, "learning_rate": 9.132785025324524e-06, "loss": 0.5342, "step": 4070 }, { "epoch": 0.816, "grad_norm": 1.359027624130249, "learning_rate": 9.12622474863545e-06, "loss": 0.5038, "step": 4080 }, { "epoch": 0.818, "grad_norm": 1.2182432413101196, "learning_rate": 9.119642126713147e-06, "loss": 0.4761, "step": 4090 }, { "epoch": 0.82, "grad_norm": 0.905654788017273, "learning_rate": 9.113037195205267e-06, "loss": 0.4434, "step": 4100 }, { "epoch": 0.822, "grad_norm": 1.273230791091919, "learning_rate": 9.106409989880274e-06, "loss": 0.4119, "step": 4110 }, { "epoch": 0.824, "grad_norm": 1.1936393976211548, "learning_rate": 9.099760546627262e-06, "loss": 0.5159, "step": 4120 }, { "epoch": 0.826, "grad_norm": 1.196582555770874, "learning_rate": 9.093088901455746e-06, "loss": 0.4788, "step": 4130 }, { "epoch": 0.828, "grad_norm": 1.420160174369812, "learning_rate": 9.086395090495475e-06, "loss": 0.4298, "step": 4140 }, { "epoch": 0.83, "grad_norm": 1.2535393238067627, "learning_rate": 9.079679149996235e-06, "loss": 0.5149, "step": 4150 }, { "epoch": 0.832, "grad_norm": 0.9435884356498718, "learning_rate": 9.072941116327654e-06, "loss": 0.4495, "step": 4160 }, { "epoch": 0.834, "grad_norm": 1.1288723945617676, "learning_rate": 9.066181025979006e-06, "loss": 0.4988, "step": 4170 }, { "epoch": 0.836, "grad_norm": 1.53179132938385, "learning_rate": 9.059398915559005e-06, "loss": 0.4834, "step": 4180 }, { "epoch": 0.838, "grad_norm": 1.48374342918396, "learning_rate": 9.052594821795616e-06, "loss": 0.5016, "step": 4190 }, { "epoch": 0.84, "grad_norm": 1.3857916593551636, "learning_rate": 9.045768781535857e-06, "loss": 0.5277, "step": 4200 }, { "epoch": 0.842, "grad_norm": 1.1805408000946045, "learning_rate": 9.038920831745587e-06, "loss": 0.432, "step": 4210 }, { "epoch": 0.844, "grad_norm": 1.1315293312072754, "learning_rate": 9.032051009509324e-06, "loss": 0.4377, "step": 4220 }, { "epoch": 0.846, "grad_norm": 0.9386146068572998, "learning_rate": 9.025159352030024e-06, "loss": 0.4993, "step": 4230 }, { "epoch": 0.848, "grad_norm": 1.187091588973999, "learning_rate": 9.0182458966289e-06, "loss": 0.5185, "step": 4240 }, { "epoch": 0.85, "grad_norm": 1.0254029035568237, "learning_rate": 9.011310680745203e-06, "loss": 0.471, "step": 4250 }, { "epoch": 0.852, "grad_norm": 1.1841859817504883, "learning_rate": 9.004353741936028e-06, "loss": 0.4047, "step": 4260 }, { "epoch": 0.854, "grad_norm": 1.25779390335083, "learning_rate": 8.99737511787611e-06, "loss": 0.4827, "step": 4270 }, { "epoch": 0.856, "grad_norm": 1.2962913513183594, "learning_rate": 8.990374846357616e-06, "loss": 0.4792, "step": 4280 }, { "epoch": 0.858, "grad_norm": 1.1236214637756348, "learning_rate": 8.98335296528995e-06, "loss": 0.5092, "step": 4290 }, { "epoch": 0.86, "grad_norm": 1.0570876598358154, "learning_rate": 8.97630951269953e-06, "loss": 0.4761, "step": 4300 }, { "epoch": 0.862, "grad_norm": 1.0793334245681763, "learning_rate": 8.969244526729599e-06, "loss": 0.425, "step": 4310 }, { "epoch": 0.864, "grad_norm": 1.0466865301132202, "learning_rate": 8.962158045640014e-06, "loss": 0.4867, "step": 4320 }, { "epoch": 0.866, "grad_norm": 1.8099735975265503, "learning_rate": 8.955050107807035e-06, "loss": 0.453, "step": 4330 }, { "epoch": 0.868, "grad_norm": 1.0396076440811157, "learning_rate": 8.947920751723119e-06, "loss": 0.4536, "step": 4340 }, { "epoch": 0.87, "grad_norm": 1.1914063692092896, "learning_rate": 8.940770015996707e-06, "loss": 0.448, "step": 4350 }, { "epoch": 0.872, "grad_norm": 1.0686973333358765, "learning_rate": 8.933597939352031e-06, "loss": 0.4932, "step": 4360 }, { "epoch": 0.874, "grad_norm": 0.9669390320777893, "learning_rate": 8.926404560628882e-06, "loss": 0.4812, "step": 4370 }, { "epoch": 0.876, "grad_norm": 1.4926239252090454, "learning_rate": 8.919189918782419e-06, "loss": 0.4337, "step": 4380 }, { "epoch": 0.878, "grad_norm": 1.3908580541610718, "learning_rate": 8.911954052882941e-06, "loss": 0.4595, "step": 4390 }, { "epoch": 0.88, "grad_norm": 1.088005781173706, "learning_rate": 8.904697002115693e-06, "loss": 0.4685, "step": 4400 }, { "epoch": 0.882, "grad_norm": 1.2951703071594238, "learning_rate": 8.89741880578064e-06, "loss": 0.5138, "step": 4410 }, { "epoch": 0.884, "grad_norm": 1.3422932624816895, "learning_rate": 8.890119503292258e-06, "loss": 0.5339, "step": 4420 }, { "epoch": 0.886, "grad_norm": 1.6533243656158447, "learning_rate": 8.882799134179326e-06, "loss": 0.4854, "step": 4430 }, { "epoch": 0.888, "grad_norm": 1.5386734008789062, "learning_rate": 8.875457738084706e-06, "loss": 0.4791, "step": 4440 }, { "epoch": 0.89, "grad_norm": 1.3411436080932617, "learning_rate": 8.868095354765125e-06, "loss": 0.4922, "step": 4450 }, { "epoch": 0.892, "grad_norm": 0.9799019694328308, "learning_rate": 8.860712024090971e-06, "loss": 0.4265, "step": 4460 }, { "epoch": 0.894, "grad_norm": 1.7590001821517944, "learning_rate": 8.853307786046073e-06, "loss": 0.5033, "step": 4470 }, { "epoch": 0.896, "grad_norm": 1.1440484523773193, "learning_rate": 8.84588268072747e-06, "loss": 0.5036, "step": 4480 }, { "epoch": 0.898, "grad_norm": 1.0841706991195679, "learning_rate": 8.838436748345217e-06, "loss": 0.5344, "step": 4490 }, { "epoch": 0.9, "grad_norm": 1.1884775161743164, "learning_rate": 8.830970029222152e-06, "loss": 0.3504, "step": 4500 }, { "epoch": 0.902, "grad_norm": 0.9909444451332092, "learning_rate": 8.823482563793687e-06, "loss": 0.4838, "step": 4510 }, { "epoch": 0.904, "grad_norm": 1.079696774482727, "learning_rate": 8.815974392607573e-06, "loss": 0.4601, "step": 4520 }, { "epoch": 0.906, "grad_norm": 1.1453964710235596, "learning_rate": 8.808445556323703e-06, "loss": 0.4623, "step": 4530 }, { "epoch": 0.908, "grad_norm": 1.1338945627212524, "learning_rate": 8.80089609571387e-06, "loss": 0.5288, "step": 4540 }, { "epoch": 0.91, "grad_norm": 1.3115707635879517, "learning_rate": 8.79332605166157e-06, "loss": 0.5267, "step": 4550 }, { "epoch": 0.912, "grad_norm": 1.3860828876495361, "learning_rate": 8.785735465161752e-06, "loss": 0.4985, "step": 4560 }, { "epoch": 0.914, "grad_norm": 1.2067689895629883, "learning_rate": 8.778124377320619e-06, "loss": 0.4699, "step": 4570 }, { "epoch": 0.916, "grad_norm": 1.4744197130203247, "learning_rate": 8.770492829355395e-06, "loss": 0.4483, "step": 4580 }, { "epoch": 0.918, "grad_norm": 1.1251822710037231, "learning_rate": 8.762840862594106e-06, "loss": 0.4664, "step": 4590 }, { "epoch": 0.92, "grad_norm": 0.8830546736717224, "learning_rate": 8.755168518475351e-06, "loss": 0.533, "step": 4600 }, { "epoch": 0.922, "grad_norm": 1.2034662961959839, "learning_rate": 8.747475838548088e-06, "loss": 0.4262, "step": 4610 }, { "epoch": 0.924, "grad_norm": 1.3047678470611572, "learning_rate": 8.739762864471392e-06, "loss": 0.5621, "step": 4620 }, { "epoch": 0.926, "grad_norm": 1.0258435010910034, "learning_rate": 8.732029638014249e-06, "loss": 0.5337, "step": 4630 }, { "epoch": 0.928, "grad_norm": 1.1698180437088013, "learning_rate": 8.724276201055311e-06, "loss": 0.4402, "step": 4640 }, { "epoch": 0.93, "grad_norm": 1.0054242610931396, "learning_rate": 8.716502595582685e-06, "loss": 0.5391, "step": 4650 }, { "epoch": 0.932, "grad_norm": 0.8926470875740051, "learning_rate": 8.708708863693696e-06, "loss": 0.4663, "step": 4660 }, { "epoch": 0.934, "grad_norm": 1.3373093605041504, "learning_rate": 8.700895047594664e-06, "loss": 0.5086, "step": 4670 }, { "epoch": 0.936, "grad_norm": 1.17959725856781, "learning_rate": 8.693061189600671e-06, "loss": 0.4284, "step": 4680 }, { "epoch": 0.938, "grad_norm": 1.0325318574905396, "learning_rate": 8.685207332135337e-06, "loss": 0.5204, "step": 4690 }, { "epoch": 0.94, "grad_norm": 0.8622048497200012, "learning_rate": 8.677333517730582e-06, "loss": 0.4318, "step": 4700 }, { "epoch": 0.942, "grad_norm": 1.214130163192749, "learning_rate": 8.669439789026409e-06, "loss": 0.4886, "step": 4710 }, { "epoch": 0.944, "grad_norm": 1.3919086456298828, "learning_rate": 8.66152618877066e-06, "loss": 0.4725, "step": 4720 }, { "epoch": 0.946, "grad_norm": 1.263291835784912, "learning_rate": 8.65359275981879e-06, "loss": 0.4854, "step": 4730 }, { "epoch": 0.948, "grad_norm": 1.5392141342163086, "learning_rate": 8.645639545133638e-06, "loss": 0.5634, "step": 4740 }, { "epoch": 0.95, "grad_norm": 1.246302843093872, "learning_rate": 8.637666587785185e-06, "loss": 0.4597, "step": 4750 }, { "epoch": 0.952, "grad_norm": 1.0350593328475952, "learning_rate": 8.629673930950335e-06, "loss": 0.4152, "step": 4760 }, { "epoch": 0.954, "grad_norm": 1.295961618423462, "learning_rate": 8.621661617912665e-06, "loss": 0.5035, "step": 4770 }, { "epoch": 0.956, "grad_norm": 1.3225280046463013, "learning_rate": 8.613629692062204e-06, "loss": 0.5491, "step": 4780 }, { "epoch": 0.958, "grad_norm": 1.495888352394104, "learning_rate": 8.60557819689519e-06, "loss": 0.5184, "step": 4790 }, { "epoch": 0.96, "grad_norm": 1.549146056175232, "learning_rate": 8.597507176013839e-06, "loss": 0.4976, "step": 4800 }, { "epoch": 0.962, "grad_norm": 1.1670464277267456, "learning_rate": 8.589416673126104e-06, "loss": 0.4262, "step": 4810 }, { "epoch": 0.964, "grad_norm": 1.1644515991210938, "learning_rate": 8.581306732045443e-06, "loss": 0.4646, "step": 4820 }, { "epoch": 0.966, "grad_norm": 0.8502632975578308, "learning_rate": 8.57317739669058e-06, "loss": 0.4702, "step": 4830 }, { "epoch": 0.968, "grad_norm": 1.3403635025024414, "learning_rate": 8.565028711085266e-06, "loss": 0.4864, "step": 4840 }, { "epoch": 0.97, "grad_norm": 1.1791622638702393, "learning_rate": 8.556860719358045e-06, "loss": 0.41, "step": 4850 }, { "epoch": 0.972, "grad_norm": 1.5249660015106201, "learning_rate": 8.548673465742006e-06, "loss": 0.4471, "step": 4860 }, { "epoch": 0.974, "grad_norm": 1.3172943592071533, "learning_rate": 8.540466994574556e-06, "loss": 0.4734, "step": 4870 }, { "epoch": 0.976, "grad_norm": 1.012715458869934, "learning_rate": 8.532241350297167e-06, "loss": 0.4765, "step": 4880 }, { "epoch": 0.978, "grad_norm": 0.9505335092544556, "learning_rate": 8.523996577455144e-06, "loss": 0.4513, "step": 4890 }, { "epoch": 0.98, "grad_norm": 0.9823141098022461, "learning_rate": 8.515732720697383e-06, "loss": 0.5406, "step": 4900 }, { "epoch": 0.982, "grad_norm": 1.1936461925506592, "learning_rate": 8.507449824776125e-06, "loss": 0.4852, "step": 4910 }, { "epoch": 0.984, "grad_norm": 1.1408801078796387, "learning_rate": 8.499147934546716e-06, "loss": 0.454, "step": 4920 }, { "epoch": 0.986, "grad_norm": 1.1299476623535156, "learning_rate": 8.490827094967364e-06, "loss": 0.4875, "step": 4930 }, { "epoch": 0.988, "grad_norm": 0.7930355668067932, "learning_rate": 8.482487351098899e-06, "loss": 0.4449, "step": 4940 }, { "epoch": 0.99, "grad_norm": 1.4795210361480713, "learning_rate": 8.47412874810452e-06, "loss": 0.5077, "step": 4950 }, { "epoch": 0.992, "grad_norm": 1.3691011667251587, "learning_rate": 8.465751331249558e-06, "loss": 0.4822, "step": 4960 }, { "epoch": 0.994, "grad_norm": 1.6543487310409546, "learning_rate": 8.457355145901235e-06, "loss": 0.4765, "step": 4970 }, { "epoch": 0.996, "grad_norm": 1.147316813468933, "learning_rate": 8.448940237528404e-06, "loss": 0.4662, "step": 4980 }, { "epoch": 0.998, "grad_norm": 1.3053276538848877, "learning_rate": 8.440506651701315e-06, "loss": 0.4694, "step": 4990 }, { "epoch": 1.0, "grad_norm": 0.914472758769989, "learning_rate": 8.43205443409136e-06, "loss": 0.3919, "step": 5000 }, { "epoch": 1.002, "grad_norm": 1.1292088031768799, "learning_rate": 8.42358363047084e-06, "loss": 0.2872, "step": 5010 }, { "epoch": 1.004, "grad_norm": 1.1585907936096191, "learning_rate": 8.415094286712694e-06, "loss": 0.3777, "step": 5020 }, { "epoch": 1.006, "grad_norm": 1.4778828620910645, "learning_rate": 8.406586448790277e-06, "loss": 0.3282, "step": 5030 }, { "epoch": 1.008, "grad_norm": 1.3356106281280518, "learning_rate": 8.398060162777084e-06, "loss": 0.3194, "step": 5040 }, { "epoch": 1.01, "grad_norm": 1.343839168548584, "learning_rate": 8.389515474846522e-06, "loss": 0.3194, "step": 5050 }, { "epoch": 1.012, "grad_norm": 1.3251718282699585, "learning_rate": 8.380952431271653e-06, "loss": 0.3459, "step": 5060 }, { "epoch": 1.014, "grad_norm": 1.7042008638381958, "learning_rate": 8.372371078424941e-06, "loss": 0.3321, "step": 5070 }, { "epoch": 1.016, "grad_norm": 1.3194146156311035, "learning_rate": 8.363771462778e-06, "loss": 0.3652, "step": 5080 }, { "epoch": 1.018, "grad_norm": 1.481685996055603, "learning_rate": 8.355153630901344e-06, "loss": 0.3853, "step": 5090 }, { "epoch": 1.02, "grad_norm": 1.6290403604507446, "learning_rate": 8.346517629464141e-06, "loss": 0.2857, "step": 5100 }, { "epoch": 1.022, "grad_norm": 0.9795701503753662, "learning_rate": 8.337863505233954e-06, "loss": 0.3388, "step": 5110 }, { "epoch": 1.024, "grad_norm": 1.4000308513641357, "learning_rate": 8.32919130507648e-06, "loss": 0.3922, "step": 5120 }, { "epoch": 1.026, "grad_norm": 1.081754446029663, "learning_rate": 8.320501075955317e-06, "loss": 0.2852, "step": 5130 }, { "epoch": 1.028, "grad_norm": 1.0564137697219849, "learning_rate": 8.311792864931686e-06, "loss": 0.3219, "step": 5140 }, { "epoch": 1.03, "grad_norm": 1.1438813209533691, "learning_rate": 8.303066719164195e-06, "loss": 0.2861, "step": 5150 }, { "epoch": 1.032, "grad_norm": 1.4927905797958374, "learning_rate": 8.294322685908576e-06, "loss": 0.3144, "step": 5160 }, { "epoch": 1.034, "grad_norm": 1.9864319562911987, "learning_rate": 8.285560812517423e-06, "loss": 0.3573, "step": 5170 }, { "epoch": 1.036, "grad_norm": 1.1521250009536743, "learning_rate": 8.27678114643995e-06, "loss": 0.3696, "step": 5180 }, { "epoch": 1.038, "grad_norm": 1.2286995649337769, "learning_rate": 8.26798373522172e-06, "loss": 0.3882, "step": 5190 }, { "epoch": 1.04, "grad_norm": 1.3970285654067993, "learning_rate": 8.259168626504395e-06, "loss": 0.3408, "step": 5200 }, { "epoch": 1.042, "grad_norm": 1.3147468566894531, "learning_rate": 8.250335868025477e-06, "loss": 0.3673, "step": 5210 }, { "epoch": 1.044, "grad_norm": 1.1939131021499634, "learning_rate": 8.241485507618046e-06, "loss": 0.3329, "step": 5220 }, { "epoch": 1.046, "grad_norm": 1.4440771341323853, "learning_rate": 8.232617593210512e-06, "loss": 0.3474, "step": 5230 }, { "epoch": 1.048, "grad_norm": 1.0242334604263306, "learning_rate": 8.223732172826336e-06, "loss": 0.361, "step": 5240 }, { "epoch": 1.05, "grad_norm": 1.4275943040847778, "learning_rate": 8.214829294583786e-06, "loss": 0.3043, "step": 5250 }, { "epoch": 1.052, "grad_norm": 1.5001543760299683, "learning_rate": 8.205909006695679e-06, "loss": 0.3676, "step": 5260 }, { "epoch": 1.054, "grad_norm": 1.3494583368301392, "learning_rate": 8.196971357469098e-06, "loss": 0.3486, "step": 5270 }, { "epoch": 1.056, "grad_norm": 1.2720623016357422, "learning_rate": 8.188016395305156e-06, "loss": 0.3145, "step": 5280 }, { "epoch": 1.058, "grad_norm": 1.5187513828277588, "learning_rate": 8.179044168698722e-06, "loss": 0.3581, "step": 5290 }, { "epoch": 1.06, "grad_norm": 2.041038990020752, "learning_rate": 8.170054726238152e-06, "loss": 0.3858, "step": 5300 }, { "epoch": 1.062, "grad_norm": 1.375152587890625, "learning_rate": 8.161048116605039e-06, "loss": 0.3401, "step": 5310 }, { "epoch": 1.064, "grad_norm": 0.8547447323799133, "learning_rate": 8.152024388573945e-06, "loss": 0.3006, "step": 5320 }, { "epoch": 1.066, "grad_norm": 2.131930351257324, "learning_rate": 8.142983591012128e-06, "loss": 0.3521, "step": 5330 }, { "epoch": 1.068, "grad_norm": 1.6276785135269165, "learning_rate": 8.133925772879292e-06, "loss": 0.3877, "step": 5340 }, { "epoch": 1.07, "grad_norm": 1.4579194784164429, "learning_rate": 8.124850983227313e-06, "loss": 0.3793, "step": 5350 }, { "epoch": 1.072, "grad_norm": 1.182732105255127, "learning_rate": 8.115759271199967e-06, "loss": 0.3425, "step": 5360 }, { "epoch": 1.074, "grad_norm": 1.5831196308135986, "learning_rate": 8.106650686032687e-06, "loss": 0.3424, "step": 5370 }, { "epoch": 1.076, "grad_norm": 0.9246402978897095, "learning_rate": 8.097525277052265e-06, "loss": 0.3245, "step": 5380 }, { "epoch": 1.078, "grad_norm": 1.0254276990890503, "learning_rate": 8.08838309367661e-06, "loss": 0.3218, "step": 5390 }, { "epoch": 1.08, "grad_norm": 0.9792593717575073, "learning_rate": 8.079224185414471e-06, "loss": 0.3681, "step": 5400 }, { "epoch": 1.082, "grad_norm": 1.3363689184188843, "learning_rate": 8.07004860186517e-06, "loss": 0.3799, "step": 5410 }, { "epoch": 1.084, "grad_norm": 1.571494698524475, "learning_rate": 8.060856392718326e-06, "loss": 0.2975, "step": 5420 }, { "epoch": 1.086, "grad_norm": 1.1808347702026367, "learning_rate": 8.051647607753598e-06, "loss": 0.3532, "step": 5430 }, { "epoch": 1.088, "grad_norm": 2.0380117893218994, "learning_rate": 8.04242229684041e-06, "loss": 0.3543, "step": 5440 }, { "epoch": 1.09, "grad_norm": 1.3502197265625, "learning_rate": 8.033180509937683e-06, "loss": 0.3566, "step": 5450 }, { "epoch": 1.092, "grad_norm": 1.074385404586792, "learning_rate": 8.023922297093557e-06, "loss": 0.3152, "step": 5460 }, { "epoch": 1.094, "grad_norm": 1.133074402809143, "learning_rate": 8.014647708445124e-06, "loss": 0.3784, "step": 5470 }, { "epoch": 1.096, "grad_norm": 1.1202969551086426, "learning_rate": 8.005356794218167e-06, "loss": 0.4327, "step": 5480 }, { "epoch": 1.098, "grad_norm": 1.4087947607040405, "learning_rate": 7.99604960472687e-06, "loss": 0.3206, "step": 5490 }, { "epoch": 1.1, "grad_norm": 1.140956163406372, "learning_rate": 7.986726190373562e-06, "loss": 0.2804, "step": 5500 }, { "epoch": 1.102, "grad_norm": 0.9047802686691284, "learning_rate": 7.977386601648427e-06, "loss": 0.3162, "step": 5510 }, { "epoch": 1.104, "grad_norm": 0.9571928381919861, "learning_rate": 7.968030889129247e-06, "loss": 0.2635, "step": 5520 }, { "epoch": 1.106, "grad_norm": 1.3515325784683228, "learning_rate": 7.95865910348112e-06, "loss": 0.3598, "step": 5530 }, { "epoch": 1.108, "grad_norm": 1.0597070455551147, "learning_rate": 7.949271295456187e-06, "loss": 0.2918, "step": 5540 }, { "epoch": 1.11, "grad_norm": 1.3461328744888306, "learning_rate": 7.939867515893353e-06, "loss": 0.2928, "step": 5550 }, { "epoch": 1.112, "grad_norm": 1.0447543859481812, "learning_rate": 7.930447815718022e-06, "loss": 0.3584, "step": 5560 }, { "epoch": 1.114, "grad_norm": 1.611246109008789, "learning_rate": 7.921012245941809e-06, "loss": 0.3816, "step": 5570 }, { "epoch": 1.116, "grad_norm": 0.886353611946106, "learning_rate": 7.91156085766227e-06, "loss": 0.322, "step": 5580 }, { "epoch": 1.1179999999999999, "grad_norm": 1.2039692401885986, "learning_rate": 7.90209370206263e-06, "loss": 0.2804, "step": 5590 }, { "epoch": 1.12, "grad_norm": 1.0296542644500732, "learning_rate": 7.892610830411496e-06, "loss": 0.3007, "step": 5600 }, { "epoch": 1.1219999999999999, "grad_norm": 1.3383187055587769, "learning_rate": 7.883112294062585e-06, "loss": 0.38, "step": 5610 }, { "epoch": 1.124, "grad_norm": 1.2576732635498047, "learning_rate": 7.873598144454444e-06, "loss": 0.3637, "step": 5620 }, { "epoch": 1.126, "grad_norm": 0.6940869688987732, "learning_rate": 7.864068433110176e-06, "loss": 0.2982, "step": 5630 }, { "epoch": 1.1280000000000001, "grad_norm": 1.15080726146698, "learning_rate": 7.854523211637152e-06, "loss": 0.31, "step": 5640 }, { "epoch": 1.13, "grad_norm": 1.8969014883041382, "learning_rate": 7.844962531726742e-06, "loss": 0.3375, "step": 5650 }, { "epoch": 1.1320000000000001, "grad_norm": 1.5290457010269165, "learning_rate": 7.835386445154023e-06, "loss": 0.3467, "step": 5660 }, { "epoch": 1.134, "grad_norm": 1.6540489196777344, "learning_rate": 7.825795003777515e-06, "loss": 0.3403, "step": 5670 }, { "epoch": 1.1360000000000001, "grad_norm": 1.4303158521652222, "learning_rate": 7.816188259538885e-06, "loss": 0.3727, "step": 5680 }, { "epoch": 1.138, "grad_norm": 1.2462937831878662, "learning_rate": 7.806566264462668e-06, "loss": 0.3325, "step": 5690 }, { "epoch": 1.1400000000000001, "grad_norm": 1.2328237295150757, "learning_rate": 7.796929070655994e-06, "loss": 0.3572, "step": 5700 }, { "epoch": 1.142, "grad_norm": 1.305349349975586, "learning_rate": 7.787276730308304e-06, "loss": 0.3046, "step": 5710 }, { "epoch": 1.144, "grad_norm": 1.3444890975952148, "learning_rate": 7.777609295691055e-06, "loss": 0.3464, "step": 5720 }, { "epoch": 1.146, "grad_norm": 0.8992980718612671, "learning_rate": 7.767926819157452e-06, "loss": 0.3742, "step": 5730 }, { "epoch": 1.148, "grad_norm": 1.571289300918579, "learning_rate": 7.758229353142153e-06, "loss": 0.3863, "step": 5740 }, { "epoch": 1.15, "grad_norm": 1.2571865320205688, "learning_rate": 7.748516950160993e-06, "loss": 0.3611, "step": 5750 }, { "epoch": 1.152, "grad_norm": 0.9523794651031494, "learning_rate": 7.738789662810702e-06, "loss": 0.3352, "step": 5760 }, { "epoch": 1.154, "grad_norm": 1.1043280363082886, "learning_rate": 7.729047543768608e-06, "loss": 0.4024, "step": 5770 }, { "epoch": 1.156, "grad_norm": 1.4577151536941528, "learning_rate": 7.719290645792361e-06, "loss": 0.3124, "step": 5780 }, { "epoch": 1.158, "grad_norm": 1.7957823276519775, "learning_rate": 7.709519021719644e-06, "loss": 0.4165, "step": 5790 }, { "epoch": 1.16, "grad_norm": 1.2084007263183594, "learning_rate": 7.699732724467894e-06, "loss": 0.3357, "step": 5800 }, { "epoch": 1.162, "grad_norm": 1.2520586252212524, "learning_rate": 7.689931807033999e-06, "loss": 0.3114, "step": 5810 }, { "epoch": 1.164, "grad_norm": 1.048161268234253, "learning_rate": 7.68011632249403e-06, "loss": 0.3216, "step": 5820 }, { "epoch": 1.166, "grad_norm": 1.231397032737732, "learning_rate": 7.670286324002943e-06, "loss": 0.3458, "step": 5830 }, { "epoch": 1.168, "grad_norm": 1.6105576753616333, "learning_rate": 7.66044186479429e-06, "loss": 0.3423, "step": 5840 }, { "epoch": 1.17, "grad_norm": 1.3911272287368774, "learning_rate": 7.650582998179939e-06, "loss": 0.3088, "step": 5850 }, { "epoch": 1.172, "grad_norm": 1.618174433708191, "learning_rate": 7.640709777549773e-06, "loss": 0.3516, "step": 5860 }, { "epoch": 1.174, "grad_norm": 2.1823925971984863, "learning_rate": 7.630822256371415e-06, "loss": 0.3054, "step": 5870 }, { "epoch": 1.176, "grad_norm": 1.236488938331604, "learning_rate": 7.620920488189929e-06, "loss": 0.3375, "step": 5880 }, { "epoch": 1.178, "grad_norm": 1.020044207572937, "learning_rate": 7.6110045266275305e-06, "loss": 0.2757, "step": 5890 }, { "epoch": 1.18, "grad_norm": 0.9718794822692871, "learning_rate": 7.601074425383302e-06, "loss": 0.3025, "step": 5900 }, { "epoch": 1.182, "grad_norm": 1.4841420650482178, "learning_rate": 7.591130238232892e-06, "loss": 0.3117, "step": 5910 }, { "epoch": 1.184, "grad_norm": 1.511138916015625, "learning_rate": 7.581172019028238e-06, "loss": 0.3936, "step": 5920 }, { "epoch": 1.186, "grad_norm": 1.3772696256637573, "learning_rate": 7.571199821697263e-06, "loss": 0.3486, "step": 5930 }, { "epoch": 1.188, "grad_norm": 1.3091078996658325, "learning_rate": 7.561213700243584e-06, "loss": 0.2936, "step": 5940 }, { "epoch": 1.19, "grad_norm": 1.3505816459655762, "learning_rate": 7.55121370874623e-06, "loss": 0.3203, "step": 5950 }, { "epoch": 1.192, "grad_norm": 1.3498326539993286, "learning_rate": 7.541199901359335e-06, "loss": 0.3734, "step": 5960 }, { "epoch": 1.194, "grad_norm": 1.5176153182983398, "learning_rate": 7.531172332311861e-06, "loss": 0.2997, "step": 5970 }, { "epoch": 1.196, "grad_norm": 0.9407598972320557, "learning_rate": 7.521131055907283e-06, "loss": 0.3204, "step": 5980 }, { "epoch": 1.198, "grad_norm": 1.3535823822021484, "learning_rate": 7.5110761265233156e-06, "loss": 0.2981, "step": 5990 }, { "epoch": 1.2, "grad_norm": 1.7967560291290283, "learning_rate": 7.501007598611609e-06, "loss": 0.3508, "step": 6000 }, { "epoch": 1.202, "grad_norm": 1.682611107826233, "learning_rate": 7.490925526697455e-06, "loss": 0.3258, "step": 6010 }, { "epoch": 1.204, "grad_norm": 1.2590186595916748, "learning_rate": 7.480829965379489e-06, "loss": 0.3376, "step": 6020 }, { "epoch": 1.206, "grad_norm": 1.1302205324172974, "learning_rate": 7.470720969329399e-06, "loss": 0.3297, "step": 6030 }, { "epoch": 1.208, "grad_norm": 1.1471021175384521, "learning_rate": 7.460598593291628e-06, "loss": 0.3566, "step": 6040 }, { "epoch": 1.21, "grad_norm": 2.0590806007385254, "learning_rate": 7.450462892083079e-06, "loss": 0.3354, "step": 6050 }, { "epoch": 1.212, "grad_norm": 1.6424793004989624, "learning_rate": 7.44031392059281e-06, "loss": 0.348, "step": 6060 }, { "epoch": 1.214, "grad_norm": 1.1267763376235962, "learning_rate": 7.430151733781752e-06, "loss": 0.324, "step": 6070 }, { "epoch": 1.216, "grad_norm": 1.049166202545166, "learning_rate": 7.419976386682395e-06, "loss": 0.3282, "step": 6080 }, { "epoch": 1.218, "grad_norm": 1.6423687934875488, "learning_rate": 7.409787934398502e-06, "loss": 0.2872, "step": 6090 }, { "epoch": 1.22, "grad_norm": 1.4791516065597534, "learning_rate": 7.3995864321048036e-06, "loss": 0.416, "step": 6100 }, { "epoch": 1.222, "grad_norm": 1.1335197687149048, "learning_rate": 7.389371935046703e-06, "loss": 0.3191, "step": 6110 }, { "epoch": 1.224, "grad_norm": 1.4202783107757568, "learning_rate": 7.3791444985399755e-06, "loss": 0.3429, "step": 6120 }, { "epoch": 1.226, "grad_norm": 1.132926106452942, "learning_rate": 7.368904177970466e-06, "loss": 0.3469, "step": 6130 }, { "epoch": 1.228, "grad_norm": 1.7757046222686768, "learning_rate": 7.358651028793797e-06, "loss": 0.333, "step": 6140 }, { "epoch": 1.23, "grad_norm": 1.3629705905914307, "learning_rate": 7.3483851065350595e-06, "loss": 0.3802, "step": 6150 }, { "epoch": 1.232, "grad_norm": 1.0564378499984741, "learning_rate": 7.33810646678852e-06, "loss": 0.3904, "step": 6160 }, { "epoch": 1.234, "grad_norm": 1.6500481367111206, "learning_rate": 7.327815165217309e-06, "loss": 0.3675, "step": 6170 }, { "epoch": 1.236, "grad_norm": 1.5274205207824707, "learning_rate": 7.317511257553131e-06, "loss": 0.3182, "step": 6180 }, { "epoch": 1.238, "grad_norm": 1.1405715942382812, "learning_rate": 7.307194799595958e-06, "loss": 0.3103, "step": 6190 }, { "epoch": 1.24, "grad_norm": 1.4158636331558228, "learning_rate": 7.296865847213724e-06, "loss": 0.3453, "step": 6200 }, { "epoch": 1.242, "grad_norm": 1.440067172050476, "learning_rate": 7.2865244563420304e-06, "loss": 0.4289, "step": 6210 }, { "epoch": 1.244, "grad_norm": 1.2499229907989502, "learning_rate": 7.27617068298383e-06, "loss": 0.3657, "step": 6220 }, { "epoch": 1.246, "grad_norm": 1.599689245223999, "learning_rate": 7.265804583209142e-06, "loss": 0.339, "step": 6230 }, { "epoch": 1.248, "grad_norm": 1.6074563264846802, "learning_rate": 7.25542621315473e-06, "loss": 0.3562, "step": 6240 }, { "epoch": 1.25, "grad_norm": 1.6778024435043335, "learning_rate": 7.245035629023812e-06, "loss": 0.313, "step": 6250 }, { "epoch": 1.252, "grad_norm": 1.2234922647476196, "learning_rate": 7.2346328870857465e-06, "loss": 0.3822, "step": 6260 }, { "epoch": 1.254, "grad_norm": 1.2148653268814087, "learning_rate": 7.224218043675735e-06, "loss": 0.3567, "step": 6270 }, { "epoch": 1.256, "grad_norm": 0.9696487188339233, "learning_rate": 7.21379115519451e-06, "loss": 0.3688, "step": 6280 }, { "epoch": 1.258, "grad_norm": 1.371671438217163, "learning_rate": 7.2033522781080325e-06, "loss": 0.3409, "step": 6290 }, { "epoch": 1.26, "grad_norm": 1.5369791984558105, "learning_rate": 7.192901468947193e-06, "loss": 0.3777, "step": 6300 }, { "epoch": 1.262, "grad_norm": 1.8359071016311646, "learning_rate": 7.182438784307495e-06, "loss": 0.3179, "step": 6310 }, { "epoch": 1.264, "grad_norm": 0.9479062557220459, "learning_rate": 7.171964280848749e-06, "loss": 0.2901, "step": 6320 }, { "epoch": 1.266, "grad_norm": 1.220046877861023, "learning_rate": 7.161478015294778e-06, "loss": 0.3478, "step": 6330 }, { "epoch": 1.268, "grad_norm": 1.2605116367340088, "learning_rate": 7.150980044433094e-06, "loss": 0.2351, "step": 6340 }, { "epoch": 1.27, "grad_norm": 1.6339104175567627, "learning_rate": 7.140470425114603e-06, "loss": 0.3123, "step": 6350 }, { "epoch": 1.272, "grad_norm": 1.1417044401168823, "learning_rate": 7.1299492142532876e-06, "loss": 0.3129, "step": 6360 }, { "epoch": 1.274, "grad_norm": 0.994657039642334, "learning_rate": 7.119416468825908e-06, "loss": 0.3247, "step": 6370 }, { "epoch": 1.276, "grad_norm": 1.182531714439392, "learning_rate": 7.108872245871687e-06, "loss": 0.2929, "step": 6380 }, { "epoch": 1.278, "grad_norm": 1.1829276084899902, "learning_rate": 7.098316602492004e-06, "loss": 0.3174, "step": 6390 }, { "epoch": 1.28, "grad_norm": 1.1945512294769287, "learning_rate": 7.087749595850084e-06, "loss": 0.3341, "step": 6400 }, { "epoch": 1.282, "grad_norm": 1.33237886428833, "learning_rate": 7.0771712831706855e-06, "loss": 0.3453, "step": 6410 }, { "epoch": 1.284, "grad_norm": 1.4239740371704102, "learning_rate": 7.066581721739801e-06, "loss": 0.3416, "step": 6420 }, { "epoch": 1.286, "grad_norm": 1.329602837562561, "learning_rate": 7.0559809689043325e-06, "loss": 0.3434, "step": 6430 }, { "epoch": 1.288, "grad_norm": 1.4223501682281494, "learning_rate": 7.045369082071793e-06, "loss": 0.3664, "step": 6440 }, { "epoch": 1.29, "grad_norm": 1.5064311027526855, "learning_rate": 7.034746118709989e-06, "loss": 0.3446, "step": 6450 }, { "epoch": 1.292, "grad_norm": 1.0884929895401, "learning_rate": 7.024112136346713e-06, "loss": 0.2676, "step": 6460 }, { "epoch": 1.294, "grad_norm": 1.623273491859436, "learning_rate": 7.013467192569427e-06, "loss": 0.4001, "step": 6470 }, { "epoch": 1.296, "grad_norm": 1.3760849237442017, "learning_rate": 7.002811345024951e-06, "loss": 0.3415, "step": 6480 }, { "epoch": 1.298, "grad_norm": 1.071457862854004, "learning_rate": 6.992144651419163e-06, "loss": 0.3584, "step": 6490 }, { "epoch": 1.3, "grad_norm": 1.245360255241394, "learning_rate": 6.981467169516671e-06, "loss": 0.341, "step": 6500 }, { "epoch": 1.302, "grad_norm": 1.653593897819519, "learning_rate": 6.9707789571405025e-06, "loss": 0.3626, "step": 6510 }, { "epoch": 1.304, "grad_norm": 0.9302300214767456, "learning_rate": 6.960080072171802e-06, "loss": 0.2995, "step": 6520 }, { "epoch": 1.306, "grad_norm": 1.358490228652954, "learning_rate": 6.949370572549506e-06, "loss": 0.3312, "step": 6530 }, { "epoch": 1.308, "grad_norm": 1.5318645238876343, "learning_rate": 6.938650516270038e-06, "loss": 0.3552, "step": 6540 }, { "epoch": 1.31, "grad_norm": 1.6989809274673462, "learning_rate": 6.927919961386984e-06, "loss": 0.3347, "step": 6550 }, { "epoch": 1.312, "grad_norm": 1.2937415838241577, "learning_rate": 6.9171789660107876e-06, "loss": 0.3167, "step": 6560 }, { "epoch": 1.314, "grad_norm": 1.2662570476531982, "learning_rate": 6.906427588308436e-06, "loss": 0.374, "step": 6570 }, { "epoch": 1.316, "grad_norm": 1.152688980102539, "learning_rate": 6.895665886503136e-06, "loss": 0.2952, "step": 6580 }, { "epoch": 1.318, "grad_norm": 1.0898213386535645, "learning_rate": 6.8848939188740034e-06, "loss": 0.3529, "step": 6590 }, { "epoch": 1.32, "grad_norm": 1.2499762773513794, "learning_rate": 6.874111743755751e-06, "loss": 0.335, "step": 6600 }, { "epoch": 1.322, "grad_norm": 1.5297431945800781, "learning_rate": 6.863319419538366e-06, "loss": 0.3896, "step": 6610 }, { "epoch": 1.324, "grad_norm": 1.6625542640686035, "learning_rate": 6.852517004666801e-06, "loss": 0.3455, "step": 6620 }, { "epoch": 1.326, "grad_norm": 1.4104151725769043, "learning_rate": 6.84170455764065e-06, "loss": 0.3535, "step": 6630 }, { "epoch": 1.328, "grad_norm": 1.2529921531677246, "learning_rate": 6.830882137013839e-06, "loss": 0.3219, "step": 6640 }, { "epoch": 1.33, "grad_norm": 1.6299757957458496, "learning_rate": 6.820049801394303e-06, "loss": 0.4037, "step": 6650 }, { "epoch": 1.332, "grad_norm": 1.215126872062683, "learning_rate": 6.80920760944367e-06, "loss": 0.3307, "step": 6660 }, { "epoch": 1.334, "grad_norm": 1.0372707843780518, "learning_rate": 6.798355619876944e-06, "loss": 0.3326, "step": 6670 }, { "epoch": 1.336, "grad_norm": 1.1610356569290161, "learning_rate": 6.787493891462191e-06, "loss": 0.3467, "step": 6680 }, { "epoch": 1.338, "grad_norm": 1.7156941890716553, "learning_rate": 6.776622483020214e-06, "loss": 0.4261, "step": 6690 }, { "epoch": 1.34, "grad_norm": 0.9658931493759155, "learning_rate": 6.765741453424237e-06, "loss": 0.307, "step": 6700 }, { "epoch": 1.342, "grad_norm": 0.8651520013809204, "learning_rate": 6.754850861599589e-06, "loss": 0.3802, "step": 6710 }, { "epoch": 1.3439999999999999, "grad_norm": 1.15030837059021, "learning_rate": 6.743950766523377e-06, "loss": 0.3026, "step": 6720 }, { "epoch": 1.346, "grad_norm": 1.2045297622680664, "learning_rate": 6.733041227224182e-06, "loss": 0.3673, "step": 6730 }, { "epoch": 1.3479999999999999, "grad_norm": 1.098310112953186, "learning_rate": 6.722122302781716e-06, "loss": 0.3305, "step": 6740 }, { "epoch": 1.35, "grad_norm": 1.5491714477539062, "learning_rate": 6.711194052326528e-06, "loss": 0.3063, "step": 6750 }, { "epoch": 1.3519999999999999, "grad_norm": 1.581486463546753, "learning_rate": 6.700256535039665e-06, "loss": 0.3534, "step": 6760 }, { "epoch": 1.354, "grad_norm": 1.3409374952316284, "learning_rate": 6.689309810152359e-06, "loss": 0.3866, "step": 6770 }, { "epoch": 1.3559999999999999, "grad_norm": 1.0301975011825562, "learning_rate": 6.678353936945704e-06, "loss": 0.3391, "step": 6780 }, { "epoch": 1.358, "grad_norm": 1.1352406740188599, "learning_rate": 6.6673889747503364e-06, "loss": 0.2808, "step": 6790 }, { "epoch": 1.3599999999999999, "grad_norm": 1.2140257358551025, "learning_rate": 6.656414982946115e-06, "loss": 0.3619, "step": 6800 }, { "epoch": 1.362, "grad_norm": 1.853940725326538, "learning_rate": 6.645432020961796e-06, "loss": 0.3014, "step": 6810 }, { "epoch": 1.3639999999999999, "grad_norm": 0.9603062272071838, "learning_rate": 6.634440148274712e-06, "loss": 0.3011, "step": 6820 }, { "epoch": 1.366, "grad_norm": 0.9446805715560913, "learning_rate": 6.623439424410456e-06, "loss": 0.3335, "step": 6830 }, { "epoch": 1.3679999999999999, "grad_norm": 1.3330634832382202, "learning_rate": 6.612429908942546e-06, "loss": 0.3776, "step": 6840 }, { "epoch": 1.37, "grad_norm": 1.3779356479644775, "learning_rate": 6.601411661492114e-06, "loss": 0.3175, "step": 6850 }, { "epoch": 1.3719999999999999, "grad_norm": 1.492652416229248, "learning_rate": 6.590384741727583e-06, "loss": 0.3377, "step": 6860 }, { "epoch": 1.374, "grad_norm": 1.719750165939331, "learning_rate": 6.579349209364332e-06, "loss": 0.3711, "step": 6870 }, { "epoch": 1.376, "grad_norm": 1.331905722618103, "learning_rate": 6.5683051241643894e-06, "loss": 0.3052, "step": 6880 }, { "epoch": 1.3780000000000001, "grad_norm": 1.1688127517700195, "learning_rate": 6.557252545936095e-06, "loss": 0.301, "step": 6890 }, { "epoch": 1.38, "grad_norm": 0.8940510153770447, "learning_rate": 6.546191534533783e-06, "loss": 0.3122, "step": 6900 }, { "epoch": 1.3820000000000001, "grad_norm": 1.363988995552063, "learning_rate": 6.53512214985746e-06, "loss": 0.3079, "step": 6910 }, { "epoch": 1.384, "grad_norm": 0.9122200608253479, "learning_rate": 6.5240444518524736e-06, "loss": 0.3486, "step": 6920 }, { "epoch": 1.3860000000000001, "grad_norm": 1.0334652662277222, "learning_rate": 6.512958500509193e-06, "loss": 0.2696, "step": 6930 }, { "epoch": 1.388, "grad_norm": 1.257965326309204, "learning_rate": 6.501864355862682e-06, "loss": 0.3648, "step": 6940 }, { "epoch": 1.3900000000000001, "grad_norm": 1.1571115255355835, "learning_rate": 6.490762077992376e-06, "loss": 0.3474, "step": 6950 }, { "epoch": 1.392, "grad_norm": 1.2089656591415405, "learning_rate": 6.479651727021754e-06, "loss": 0.3313, "step": 6960 }, { "epoch": 1.3940000000000001, "grad_norm": 1.4407920837402344, "learning_rate": 6.4685333631180145e-06, "loss": 0.3695, "step": 6970 }, { "epoch": 1.396, "grad_norm": 2.0778284072875977, "learning_rate": 6.457407046491748e-06, "loss": 0.2625, "step": 6980 }, { "epoch": 1.3980000000000001, "grad_norm": 1.36530339717865, "learning_rate": 6.4462728373966165e-06, "loss": 0.3015, "step": 6990 }, { "epoch": 1.4, "grad_norm": 1.8104184865951538, "learning_rate": 6.435130796129019e-06, "loss": 0.4064, "step": 7000 }, { "epoch": 1.4020000000000001, "grad_norm": 1.3507591485977173, "learning_rate": 6.423980983027769e-06, "loss": 0.3225, "step": 7010 }, { "epoch": 1.404, "grad_norm": 0.7463653087615967, "learning_rate": 6.412823458473772e-06, "loss": 0.2916, "step": 7020 }, { "epoch": 1.4060000000000001, "grad_norm": 1.5296553373336792, "learning_rate": 6.401658282889689e-06, "loss": 0.3413, "step": 7030 }, { "epoch": 1.408, "grad_norm": 1.1751571893692017, "learning_rate": 6.390485516739616e-06, "loss": 0.3598, "step": 7040 }, { "epoch": 1.41, "grad_norm": 1.090745449066162, "learning_rate": 6.379305220528758e-06, "loss": 0.3398, "step": 7050 }, { "epoch": 1.412, "grad_norm": 1.3860448598861694, "learning_rate": 6.368117454803093e-06, "loss": 0.2953, "step": 7060 }, { "epoch": 1.414, "grad_norm": 1.4662338495254517, "learning_rate": 6.356922280149058e-06, "loss": 0.3662, "step": 7070 }, { "epoch": 1.416, "grad_norm": 1.0763154029846191, "learning_rate": 6.345719757193203e-06, "loss": 0.3182, "step": 7080 }, { "epoch": 1.418, "grad_norm": 1.2442153692245483, "learning_rate": 6.334509946601879e-06, "loss": 0.3177, "step": 7090 }, { "epoch": 1.42, "grad_norm": 1.304867148399353, "learning_rate": 6.323292909080897e-06, "loss": 0.3322, "step": 7100 }, { "epoch": 1.422, "grad_norm": 1.5023126602172852, "learning_rate": 6.3120687053752114e-06, "loss": 0.3425, "step": 7110 }, { "epoch": 1.424, "grad_norm": 1.539007306098938, "learning_rate": 6.3008373962685785e-06, "loss": 0.3376, "step": 7120 }, { "epoch": 1.426, "grad_norm": 1.6825087070465088, "learning_rate": 6.289599042583237e-06, "loss": 0.364, "step": 7130 }, { "epoch": 1.428, "grad_norm": 1.3176345825195312, "learning_rate": 6.278353705179572e-06, "loss": 0.3753, "step": 7140 }, { "epoch": 1.43, "grad_norm": 1.2803112268447876, "learning_rate": 6.267101444955792e-06, "loss": 0.3295, "step": 7150 }, { "epoch": 1.432, "grad_norm": 1.4513520002365112, "learning_rate": 6.255842322847594e-06, "loss": 0.3179, "step": 7160 }, { "epoch": 1.434, "grad_norm": 1.314857006072998, "learning_rate": 6.244576399827831e-06, "loss": 0.3537, "step": 7170 }, { "epoch": 1.436, "grad_norm": 1.1115436553955078, "learning_rate": 6.233303736906193e-06, "loss": 0.313, "step": 7180 }, { "epoch": 1.438, "grad_norm": 1.1615487337112427, "learning_rate": 6.222024395128864e-06, "loss": 0.3293, "step": 7190 }, { "epoch": 1.44, "grad_norm": 1.4486738443374634, "learning_rate": 6.210738435578198e-06, "loss": 0.3793, "step": 7200 }, { "epoch": 1.442, "grad_norm": 1.143863558769226, "learning_rate": 6.199445919372388e-06, "loss": 0.4045, "step": 7210 }, { "epoch": 1.444, "grad_norm": 1.4991508722305298, "learning_rate": 6.1881469076651336e-06, "loss": 0.3216, "step": 7220 }, { "epoch": 1.446, "grad_norm": 1.1230875253677368, "learning_rate": 6.176841461645311e-06, "loss": 0.3679, "step": 7230 }, { "epoch": 1.448, "grad_norm": 1.141588568687439, "learning_rate": 6.16552964253664e-06, "loss": 0.3378, "step": 7240 }, { "epoch": 1.45, "grad_norm": 1.8605092763900757, "learning_rate": 6.15421151159735e-06, "loss": 0.3412, "step": 7250 }, { "epoch": 1.452, "grad_norm": 1.14920175075531, "learning_rate": 6.14288713011986e-06, "loss": 0.2663, "step": 7260 }, { "epoch": 1.454, "grad_norm": 1.7807815074920654, "learning_rate": 6.13155655943043e-06, "loss": 0.3288, "step": 7270 }, { "epoch": 1.456, "grad_norm": 1.6375200748443604, "learning_rate": 6.1202198608888416e-06, "loss": 0.4102, "step": 7280 }, { "epoch": 1.458, "grad_norm": 1.5959913730621338, "learning_rate": 6.1088770958880595e-06, "loss": 0.3615, "step": 7290 }, { "epoch": 1.46, "grad_norm": 1.452133297920227, "learning_rate": 6.097528325853903e-06, "loss": 0.2841, "step": 7300 }, { "epoch": 1.462, "grad_norm": 1.8519127368927002, "learning_rate": 6.086173612244708e-06, "loss": 0.3331, "step": 7310 }, { "epoch": 1.464, "grad_norm": 1.0498560667037964, "learning_rate": 6.074813016550998e-06, "loss": 0.2939, "step": 7320 }, { "epoch": 1.466, "grad_norm": 1.369956612586975, "learning_rate": 6.0634466002951545e-06, "loss": 0.3859, "step": 7330 }, { "epoch": 1.468, "grad_norm": 1.7278661727905273, "learning_rate": 6.052074425031075e-06, "loss": 0.3599, "step": 7340 }, { "epoch": 1.47, "grad_norm": 1.1767780780792236, "learning_rate": 6.040696552343845e-06, "loss": 0.3392, "step": 7350 }, { "epoch": 1.472, "grad_norm": 1.4453753232955933, "learning_rate": 6.029313043849407e-06, "loss": 0.4165, "step": 7360 }, { "epoch": 1.474, "grad_norm": 1.0843770503997803, "learning_rate": 6.017923961194221e-06, "loss": 0.334, "step": 7370 }, { "epoch": 1.476, "grad_norm": 1.1147040128707886, "learning_rate": 6.006529366054935e-06, "loss": 0.3453, "step": 7380 }, { "epoch": 1.478, "grad_norm": 1.7290093898773193, "learning_rate": 5.995129320138047e-06, "loss": 0.3127, "step": 7390 }, { "epoch": 1.48, "grad_norm": 1.4090754985809326, "learning_rate": 5.983723885179576e-06, "loss": 0.3197, "step": 7400 }, { "epoch": 1.482, "grad_norm": 1.1394073963165283, "learning_rate": 5.972313122944724e-06, "loss": 0.3234, "step": 7410 }, { "epoch": 1.484, "grad_norm": 1.281721591949463, "learning_rate": 5.960897095227541e-06, "loss": 0.3311, "step": 7420 }, { "epoch": 1.486, "grad_norm": 0.9098526239395142, "learning_rate": 5.949475863850595e-06, "loss": 0.3342, "step": 7430 }, { "epoch": 1.488, "grad_norm": 0.939958930015564, "learning_rate": 5.938049490664629e-06, "loss": 0.3139, "step": 7440 }, { "epoch": 1.49, "grad_norm": 1.2372125387191772, "learning_rate": 5.926618037548237e-06, "loss": 0.3121, "step": 7450 }, { "epoch": 1.492, "grad_norm": 1.0505400896072388, "learning_rate": 5.915181566407519e-06, "loss": 0.3165, "step": 7460 }, { "epoch": 1.494, "grad_norm": 1.4748669862747192, "learning_rate": 5.903740139175752e-06, "loss": 0.3584, "step": 7470 }, { "epoch": 1.496, "grad_norm": 1.09158456325531, "learning_rate": 5.892293817813048e-06, "loss": 0.3162, "step": 7480 }, { "epoch": 1.498, "grad_norm": 1.4084733724594116, "learning_rate": 5.8808426643060265e-06, "loss": 0.3474, "step": 7490 }, { "epoch": 1.5, "grad_norm": 1.2051564455032349, "learning_rate": 5.869386740667478e-06, "loss": 0.3169, "step": 7500 }, { "epoch": 1.502, "grad_norm": 1.2120217084884644, "learning_rate": 5.857926108936015e-06, "loss": 0.3545, "step": 7510 }, { "epoch": 1.504, "grad_norm": 1.084529161453247, "learning_rate": 5.84646083117576e-06, "loss": 0.3407, "step": 7520 }, { "epoch": 1.506, "grad_norm": 2.2102389335632324, "learning_rate": 5.834990969475984e-06, "loss": 0.3275, "step": 7530 }, { "epoch": 1.508, "grad_norm": 1.6285516023635864, "learning_rate": 5.823516585950787e-06, "loss": 0.2969, "step": 7540 }, { "epoch": 1.51, "grad_norm": 1.2168736457824707, "learning_rate": 5.812037742738759e-06, "loss": 0.3845, "step": 7550 }, { "epoch": 1.512, "grad_norm": 1.2020785808563232, "learning_rate": 5.800554502002635e-06, "loss": 0.2706, "step": 7560 }, { "epoch": 1.514, "grad_norm": 1.3338371515274048, "learning_rate": 5.78906692592897e-06, "loss": 0.3217, "step": 7570 }, { "epoch": 1.516, "grad_norm": 1.2562854290008545, "learning_rate": 5.77757507672779e-06, "loss": 0.338, "step": 7580 }, { "epoch": 1.518, "grad_norm": 1.2902873754501343, "learning_rate": 5.766079016632272e-06, "loss": 0.3115, "step": 7590 }, { "epoch": 1.52, "grad_norm": 1.3528056144714355, "learning_rate": 5.7545788078983875e-06, "loss": 0.3526, "step": 7600 }, { "epoch": 1.522, "grad_norm": 0.9707136750221252, "learning_rate": 5.743074512804579e-06, "loss": 0.3693, "step": 7610 }, { "epoch": 1.524, "grad_norm": 1.6493264436721802, "learning_rate": 5.731566193651416e-06, "loss": 0.3304, "step": 7620 }, { "epoch": 1.526, "grad_norm": 1.095731258392334, "learning_rate": 5.720053912761261e-06, "loss": 0.3102, "step": 7630 }, { "epoch": 1.528, "grad_norm": 1.691691279411316, "learning_rate": 5.708537732477934e-06, "loss": 0.3019, "step": 7640 }, { "epoch": 1.53, "grad_norm": 1.4168593883514404, "learning_rate": 5.697017715166366e-06, "loss": 0.3338, "step": 7650 }, { "epoch": 1.532, "grad_norm": 1.5588029623031616, "learning_rate": 5.685493923212273e-06, "loss": 0.2633, "step": 7660 }, { "epoch": 1.534, "grad_norm": 1.6496397256851196, "learning_rate": 5.673966419021806e-06, "loss": 0.4173, "step": 7670 }, { "epoch": 1.536, "grad_norm": 1.0448615550994873, "learning_rate": 5.662435265021225e-06, "loss": 0.2849, "step": 7680 }, { "epoch": 1.538, "grad_norm": 1.2294347286224365, "learning_rate": 5.650900523656553e-06, "loss": 0.2609, "step": 7690 }, { "epoch": 1.54, "grad_norm": 1.5806496143341064, "learning_rate": 5.63936225739324e-06, "loss": 0.3771, "step": 7700 }, { "epoch": 1.542, "grad_norm": 1.1037795543670654, "learning_rate": 5.627820528715824e-06, "loss": 0.3149, "step": 7710 }, { "epoch": 1.544, "grad_norm": 0.8963010907173157, "learning_rate": 5.616275400127594e-06, "loss": 0.3073, "step": 7720 }, { "epoch": 1.546, "grad_norm": 1.5788958072662354, "learning_rate": 5.604726934150253e-06, "loss": 0.3628, "step": 7730 }, { "epoch": 1.548, "grad_norm": 1.0473259687423706, "learning_rate": 5.593175193323574e-06, "loss": 0.3114, "step": 7740 }, { "epoch": 1.55, "grad_norm": 1.295811414718628, "learning_rate": 5.581620240205068e-06, "loss": 0.4363, "step": 7750 }, { "epoch": 1.552, "grad_norm": 1.9268922805786133, "learning_rate": 5.57006213736964e-06, "loss": 0.2928, "step": 7760 }, { "epoch": 1.554, "grad_norm": 1.4225900173187256, "learning_rate": 5.558500947409249e-06, "loss": 0.3989, "step": 7770 }, { "epoch": 1.556, "grad_norm": 1.2886687517166138, "learning_rate": 5.546936732932578e-06, "loss": 0.3041, "step": 7780 }, { "epoch": 1.558, "grad_norm": 1.265717625617981, "learning_rate": 5.535369556564687e-06, "loss": 0.3384, "step": 7790 }, { "epoch": 1.56, "grad_norm": 1.2660807371139526, "learning_rate": 5.523799480946673e-06, "loss": 0.294, "step": 7800 }, { "epoch": 1.562, "grad_norm": 1.319225788116455, "learning_rate": 5.512226568735338e-06, "loss": 0.3159, "step": 7810 }, { "epoch": 1.564, "grad_norm": 1.193955659866333, "learning_rate": 5.500650882602842e-06, "loss": 0.3044, "step": 7820 }, { "epoch": 1.5659999999999998, "grad_norm": 1.2253046035766602, "learning_rate": 5.489072485236368e-06, "loss": 0.3376, "step": 7830 }, { "epoch": 1.568, "grad_norm": 1.0734641551971436, "learning_rate": 5.477491439337782e-06, "loss": 0.3555, "step": 7840 }, { "epoch": 1.5699999999999998, "grad_norm": 1.7126284837722778, "learning_rate": 5.4659078076232906e-06, "loss": 0.3871, "step": 7850 }, { "epoch": 1.572, "grad_norm": 1.323228359222412, "learning_rate": 5.45432165282311e-06, "loss": 0.3337, "step": 7860 }, { "epoch": 1.5739999999999998, "grad_norm": 1.7261728048324585, "learning_rate": 5.442733037681112e-06, "loss": 0.2912, "step": 7870 }, { "epoch": 1.576, "grad_norm": 1.790432095527649, "learning_rate": 5.431142024954496e-06, "loss": 0.3271, "step": 7880 }, { "epoch": 1.5779999999999998, "grad_norm": 1.6568429470062256, "learning_rate": 5.419548677413445e-06, "loss": 0.3535, "step": 7890 }, { "epoch": 1.58, "grad_norm": 1.545526146888733, "learning_rate": 5.4079530578407895e-06, "loss": 0.348, "step": 7900 }, { "epoch": 1.5819999999999999, "grad_norm": 1.8154616355895996, "learning_rate": 5.396355229031656e-06, "loss": 0.2892, "step": 7910 }, { "epoch": 1.584, "grad_norm": 0.9787377715110779, "learning_rate": 5.3847552537931395e-06, "loss": 0.2931, "step": 7920 }, { "epoch": 1.5859999999999999, "grad_norm": 1.3805267810821533, "learning_rate": 5.373153194943962e-06, "loss": 0.3615, "step": 7930 }, { "epoch": 1.588, "grad_norm": 2.3790717124938965, "learning_rate": 5.361549115314123e-06, "loss": 0.3537, "step": 7940 }, { "epoch": 1.5899999999999999, "grad_norm": 1.5367940664291382, "learning_rate": 5.349943077744573e-06, "loss": 0.3413, "step": 7950 }, { "epoch": 1.592, "grad_norm": 1.2089755535125732, "learning_rate": 5.338335145086855e-06, "loss": 0.3632, "step": 7960 }, { "epoch": 1.5939999999999999, "grad_norm": 1.4796146154403687, "learning_rate": 5.3267253802027826e-06, "loss": 0.4095, "step": 7970 }, { "epoch": 1.596, "grad_norm": 1.4473693370819092, "learning_rate": 5.315113845964091e-06, "loss": 0.3714, "step": 7980 }, { "epoch": 1.5979999999999999, "grad_norm": 1.5862455368041992, "learning_rate": 5.303500605252095e-06, "loss": 0.3283, "step": 7990 }, { "epoch": 1.6, "grad_norm": 1.4115406274795532, "learning_rate": 5.291885720957351e-06, "loss": 0.3127, "step": 8000 }, { "epoch": 1.6019999999999999, "grad_norm": 1.6145530939102173, "learning_rate": 5.2802692559793175e-06, "loss": 0.4085, "step": 8010 }, { "epoch": 1.604, "grad_norm": 1.8569374084472656, "learning_rate": 5.268651273226011e-06, "loss": 0.2898, "step": 8020 }, { "epoch": 1.6059999999999999, "grad_norm": 1.736876368522644, "learning_rate": 5.2570318356136705e-06, "loss": 0.3503, "step": 8030 }, { "epoch": 1.608, "grad_norm": 1.5102406740188599, "learning_rate": 5.2454110060664075e-06, "loss": 0.3111, "step": 8040 }, { "epoch": 1.6099999999999999, "grad_norm": 1.7225403785705566, "learning_rate": 5.233788847515882e-06, "loss": 0.3586, "step": 8050 }, { "epoch": 1.612, "grad_norm": 1.0642247200012207, "learning_rate": 5.222165422900939e-06, "loss": 0.3721, "step": 8060 }, { "epoch": 1.6139999999999999, "grad_norm": 1.3807940483093262, "learning_rate": 5.210540795167287e-06, "loss": 0.398, "step": 8070 }, { "epoch": 1.616, "grad_norm": 1.396348237991333, "learning_rate": 5.19891502726715e-06, "loss": 0.3657, "step": 8080 }, { "epoch": 1.6179999999999999, "grad_norm": 0.8553020358085632, "learning_rate": 5.187288182158924e-06, "loss": 0.3018, "step": 8090 }, { "epoch": 1.62, "grad_norm": 1.608389973640442, "learning_rate": 5.175660322806838e-06, "loss": 0.3563, "step": 8100 }, { "epoch": 1.6219999999999999, "grad_norm": 1.3906745910644531, "learning_rate": 5.164031512180616e-06, "loss": 0.3186, "step": 8110 }, { "epoch": 1.624, "grad_norm": 1.6109449863433838, "learning_rate": 5.152401813255134e-06, "loss": 0.3028, "step": 8120 }, { "epoch": 1.626, "grad_norm": 1.3031402826309204, "learning_rate": 5.140771289010073e-06, "loss": 0.3089, "step": 8130 }, { "epoch": 1.6280000000000001, "grad_norm": 0.8983224034309387, "learning_rate": 5.1291400024295946e-06, "loss": 0.3388, "step": 8140 }, { "epoch": 1.63, "grad_norm": 1.6017338037490845, "learning_rate": 5.117508016501973e-06, "loss": 0.3465, "step": 8150 }, { "epoch": 1.6320000000000001, "grad_norm": 1.4374866485595703, "learning_rate": 5.105875394219283e-06, "loss": 0.3152, "step": 8160 }, { "epoch": 1.634, "grad_norm": 1.3967483043670654, "learning_rate": 5.0942421985770415e-06, "loss": 0.3158, "step": 8170 }, { "epoch": 1.6360000000000001, "grad_norm": 1.0798684358596802, "learning_rate": 5.0826084925738675e-06, "loss": 0.3273, "step": 8180 }, { "epoch": 1.638, "grad_norm": 1.1431955099105835, "learning_rate": 5.070974339211148e-06, "loss": 0.4185, "step": 8190 }, { "epoch": 1.6400000000000001, "grad_norm": 1.4295889139175415, "learning_rate": 5.059339801492687e-06, "loss": 0.3667, "step": 8200 }, { "epoch": 1.642, "grad_norm": 1.2767932415008545, "learning_rate": 5.047704942424377e-06, "loss": 0.3993, "step": 8210 }, { "epoch": 1.6440000000000001, "grad_norm": 1.4808293581008911, "learning_rate": 5.0360698250138465e-06, "loss": 0.3618, "step": 8220 }, { "epoch": 1.646, "grad_norm": 1.2203787565231323, "learning_rate": 5.024434512270123e-06, "loss": 0.3237, "step": 8230 }, { "epoch": 1.6480000000000001, "grad_norm": 1.2480844259262085, "learning_rate": 5.0127990672032945e-06, "loss": 0.3361, "step": 8240 }, { "epoch": 1.65, "grad_norm": 1.269690990447998, "learning_rate": 5.001163552824162e-06, "loss": 0.2754, "step": 8250 }, { "epoch": 1.6520000000000001, "grad_norm": 1.5688775777816772, "learning_rate": 4.9895280321439036e-06, "loss": 0.3312, "step": 8260 }, { "epoch": 1.654, "grad_norm": 1.5578945875167847, "learning_rate": 4.977892568173733e-06, "loss": 0.3141, "step": 8270 }, { "epoch": 1.6560000000000001, "grad_norm": 1.0505149364471436, "learning_rate": 4.966257223924554e-06, "loss": 0.2919, "step": 8280 }, { "epoch": 1.658, "grad_norm": 1.2484036684036255, "learning_rate": 4.954622062406623e-06, "loss": 0.3513, "step": 8290 }, { "epoch": 1.6600000000000001, "grad_norm": 1.8447239398956299, "learning_rate": 4.94298714662921e-06, "loss": 0.3111, "step": 8300 }, { "epoch": 1.662, "grad_norm": 1.375136137008667, "learning_rate": 4.931352539600248e-06, "loss": 0.2827, "step": 8310 }, { "epoch": 1.6640000000000001, "grad_norm": 1.4478310346603394, "learning_rate": 4.9197183043260035e-06, "loss": 0.3177, "step": 8320 }, { "epoch": 1.666, "grad_norm": 1.2656853199005127, "learning_rate": 4.9080845038107264e-06, "loss": 0.2888, "step": 8330 }, { "epoch": 1.6680000000000001, "grad_norm": 1.177017092704773, "learning_rate": 4.896451201056315e-06, "loss": 0.365, "step": 8340 }, { "epoch": 1.67, "grad_norm": 1.2763571739196777, "learning_rate": 4.88481845906197e-06, "loss": 0.2697, "step": 8350 }, { "epoch": 1.6720000000000002, "grad_norm": 1.6193398237228394, "learning_rate": 4.873186340823854e-06, "loss": 0.2913, "step": 8360 }, { "epoch": 1.674, "grad_norm": 1.353211760520935, "learning_rate": 4.861554909334757e-06, "loss": 0.3652, "step": 8370 }, { "epoch": 1.6760000000000002, "grad_norm": 1.4687409400939941, "learning_rate": 4.8499242275837444e-06, "loss": 0.3417, "step": 8380 }, { "epoch": 1.678, "grad_norm": 1.644424557685852, "learning_rate": 4.838294358555824e-06, "loss": 0.3228, "step": 8390 }, { "epoch": 1.6800000000000002, "grad_norm": 1.8202049732208252, "learning_rate": 4.826665365231601e-06, "loss": 0.2984, "step": 8400 }, { "epoch": 1.682, "grad_norm": 0.9253039360046387, "learning_rate": 4.815037310586941e-06, "loss": 0.3807, "step": 8410 }, { "epoch": 1.6840000000000002, "grad_norm": 1.1849908828735352, "learning_rate": 4.803410257592625e-06, "loss": 0.2896, "step": 8420 }, { "epoch": 1.686, "grad_norm": 1.0804787874221802, "learning_rate": 4.791784269214004e-06, "loss": 0.3129, "step": 8430 }, { "epoch": 1.688, "grad_norm": 1.3081520795822144, "learning_rate": 4.780159408410677e-06, "loss": 0.3055, "step": 8440 }, { "epoch": 1.69, "grad_norm": 1.2547225952148438, "learning_rate": 4.7685357381361224e-06, "loss": 0.3321, "step": 8450 }, { "epoch": 1.692, "grad_norm": 1.228356957435608, "learning_rate": 4.75691332133738e-06, "loss": 0.3556, "step": 8460 }, { "epoch": 1.694, "grad_norm": 1.2856372594833374, "learning_rate": 4.745292220954696e-06, "loss": 0.3334, "step": 8470 }, { "epoch": 1.696, "grad_norm": 1.2076910734176636, "learning_rate": 4.733672499921195e-06, "loss": 0.2957, "step": 8480 }, { "epoch": 1.698, "grad_norm": 1.3459007740020752, "learning_rate": 4.722054221162528e-06, "loss": 0.3326, "step": 8490 }, { "epoch": 1.7, "grad_norm": 1.899490475654602, "learning_rate": 4.710437447596528e-06, "loss": 0.3721, "step": 8500 }, { "epoch": 1.702, "grad_norm": 1.0619953870773315, "learning_rate": 4.698822242132891e-06, "loss": 0.3474, "step": 8510 }, { "epoch": 1.704, "grad_norm": 1.1741877794265747, "learning_rate": 4.687208667672812e-06, "loss": 0.3206, "step": 8520 }, { "epoch": 1.706, "grad_norm": 1.295904517173767, "learning_rate": 4.675596787108652e-06, "loss": 0.3445, "step": 8530 }, { "epoch": 1.708, "grad_norm": 1.2269822359085083, "learning_rate": 4.6639866633236066e-06, "loss": 0.32, "step": 8540 }, { "epoch": 1.71, "grad_norm": 1.142608404159546, "learning_rate": 4.652378359191352e-06, "loss": 0.3178, "step": 8550 }, { "epoch": 1.712, "grad_norm": 1.4112498760223389, "learning_rate": 4.6407719375757095e-06, "loss": 0.3598, "step": 8560 }, { "epoch": 1.714, "grad_norm": 1.3373513221740723, "learning_rate": 4.629167461330308e-06, "loss": 0.3895, "step": 8570 }, { "epoch": 1.716, "grad_norm": 1.5300648212432861, "learning_rate": 4.617564993298244e-06, "loss": 0.3675, "step": 8580 }, { "epoch": 1.718, "grad_norm": 1.3708034753799438, "learning_rate": 4.605964596311733e-06, "loss": 0.3543, "step": 8590 }, { "epoch": 1.72, "grad_norm": 1.1198241710662842, "learning_rate": 4.594366333191778e-06, "loss": 0.4109, "step": 8600 }, { "epoch": 1.722, "grad_norm": 1.1231240034103394, "learning_rate": 4.5827702667478275e-06, "loss": 0.3643, "step": 8610 }, { "epoch": 1.724, "grad_norm": 1.6655538082122803, "learning_rate": 4.571176459777431e-06, "loss": 0.3271, "step": 8620 }, { "epoch": 1.726, "grad_norm": 1.260842204093933, "learning_rate": 4.559584975065905e-06, "loss": 0.3342, "step": 8630 }, { "epoch": 1.728, "grad_norm": 1.2399426698684692, "learning_rate": 4.547995875385986e-06, "loss": 0.2852, "step": 8640 }, { "epoch": 1.73, "grad_norm": 1.537028193473816, "learning_rate": 4.5364092234975e-06, "loss": 0.4146, "step": 8650 }, { "epoch": 1.732, "grad_norm": 1.4281028509140015, "learning_rate": 4.524825082147013e-06, "loss": 0.2961, "step": 8660 }, { "epoch": 1.734, "grad_norm": 1.294998049736023, "learning_rate": 4.513243514067495e-06, "loss": 0.2937, "step": 8670 }, { "epoch": 1.736, "grad_norm": 1.0920134782791138, "learning_rate": 4.5016645819779865e-06, "loss": 0.3434, "step": 8680 }, { "epoch": 1.738, "grad_norm": 1.160057544708252, "learning_rate": 4.490088348583246e-06, "loss": 0.2608, "step": 8690 }, { "epoch": 1.74, "grad_norm": 1.2526758909225464, "learning_rate": 4.47851487657342e-06, "loss": 0.3652, "step": 8700 }, { "epoch": 1.742, "grad_norm": 1.275640845298767, "learning_rate": 4.466944228623701e-06, "loss": 0.3862, "step": 8710 }, { "epoch": 1.744, "grad_norm": 1.3032732009887695, "learning_rate": 4.455376467393991e-06, "loss": 0.2721, "step": 8720 }, { "epoch": 1.746, "grad_norm": 1.2573343515396118, "learning_rate": 4.443811655528553e-06, "loss": 0.3276, "step": 8730 }, { "epoch": 1.748, "grad_norm": 1.478424310684204, "learning_rate": 4.432249855655681e-06, "loss": 0.3695, "step": 8740 }, { "epoch": 1.75, "grad_norm": 1.3663966655731201, "learning_rate": 4.420691130387365e-06, "loss": 0.3233, "step": 8750 }, { "epoch": 1.752, "grad_norm": 1.5956822633743286, "learning_rate": 4.409135542318931e-06, "loss": 0.3789, "step": 8760 }, { "epoch": 1.754, "grad_norm": 1.566262125968933, "learning_rate": 4.397583154028725e-06, "loss": 0.3546, "step": 8770 }, { "epoch": 1.756, "grad_norm": 1.0735946893692017, "learning_rate": 4.38603402807776e-06, "loss": 0.3638, "step": 8780 }, { "epoch": 1.758, "grad_norm": 1.182511329650879, "learning_rate": 4.374488227009391e-06, "loss": 0.3271, "step": 8790 }, { "epoch": 1.76, "grad_norm": 1.5073894262313843, "learning_rate": 4.362945813348956e-06, "loss": 0.2934, "step": 8800 }, { "epoch": 1.762, "grad_norm": 1.476199746131897, "learning_rate": 4.351406849603452e-06, "loss": 0.3143, "step": 8810 }, { "epoch": 1.764, "grad_norm": 1.2955067157745361, "learning_rate": 4.3398713982612e-06, "loss": 0.3358, "step": 8820 }, { "epoch": 1.766, "grad_norm": 1.4470880031585693, "learning_rate": 4.328339521791493e-06, "loss": 0.3117, "step": 8830 }, { "epoch": 1.768, "grad_norm": 1.264491319656372, "learning_rate": 4.316811282644265e-06, "loss": 0.3316, "step": 8840 }, { "epoch": 1.77, "grad_norm": 0.9871441125869751, "learning_rate": 4.305286743249756e-06, "loss": 0.3224, "step": 8850 }, { "epoch": 1.772, "grad_norm": 1.7321498394012451, "learning_rate": 4.293765966018167e-06, "loss": 0.3177, "step": 8860 }, { "epoch": 1.774, "grad_norm": 1.4253642559051514, "learning_rate": 4.282249013339328e-06, "loss": 0.396, "step": 8870 }, { "epoch": 1.776, "grad_norm": 1.1400525569915771, "learning_rate": 4.270735947582352e-06, "loss": 0.3207, "step": 8880 }, { "epoch": 1.778, "grad_norm": 1.3001306056976318, "learning_rate": 4.259226831095311e-06, "loss": 0.3579, "step": 8890 }, { "epoch": 1.78, "grad_norm": 0.9259517788887024, "learning_rate": 4.247721726204883e-06, "loss": 0.3522, "step": 8900 }, { "epoch": 1.782, "grad_norm": 1.352211594581604, "learning_rate": 4.236220695216024e-06, "loss": 0.3184, "step": 8910 }, { "epoch": 1.784, "grad_norm": 1.4260315895080566, "learning_rate": 4.224723800411631e-06, "loss": 0.3632, "step": 8920 }, { "epoch": 1.786, "grad_norm": 1.3724855184555054, "learning_rate": 4.2132311040521975e-06, "loss": 0.314, "step": 8930 }, { "epoch": 1.788, "grad_norm": 1.4513239860534668, "learning_rate": 4.201742668375481e-06, "loss": 0.3224, "step": 8940 }, { "epoch": 1.79, "grad_norm": 1.2112324237823486, "learning_rate": 4.190258555596168e-06, "loss": 0.3068, "step": 8950 }, { "epoch": 1.792, "grad_norm": 1.5572727918624878, "learning_rate": 4.178778827905535e-06, "loss": 0.3771, "step": 8960 }, { "epoch": 1.794, "grad_norm": 0.9895251989364624, "learning_rate": 4.167303547471108e-06, "loss": 0.3205, "step": 8970 }, { "epoch": 1.796, "grad_norm": 1.2107610702514648, "learning_rate": 4.155832776436331e-06, "loss": 0.353, "step": 8980 }, { "epoch": 1.798, "grad_norm": 1.2437814474105835, "learning_rate": 4.14436657692023e-06, "loss": 0.3566, "step": 8990 }, { "epoch": 1.8, "grad_norm": 1.1405911445617676, "learning_rate": 4.132905011017071e-06, "loss": 0.3001, "step": 9000 }, { "epoch": 1.802, "grad_norm": 1.2086176872253418, "learning_rate": 4.121448140796029e-06, "loss": 0.3399, "step": 9010 }, { "epoch": 1.804, "grad_norm": 1.3747698068618774, "learning_rate": 4.109996028300847e-06, "loss": 0.3545, "step": 9020 }, { "epoch": 1.806, "grad_norm": 1.1939738988876343, "learning_rate": 4.098548735549508e-06, "loss": 0.3397, "step": 9030 }, { "epoch": 1.808, "grad_norm": 1.5216190814971924, "learning_rate": 4.087106324533891e-06, "loss": 0.3071, "step": 9040 }, { "epoch": 1.81, "grad_norm": 1.557185411453247, "learning_rate": 4.075668857219436e-06, "loss": 0.2898, "step": 9050 }, { "epoch": 1.812, "grad_norm": 1.694498896598816, "learning_rate": 4.0642363955448175e-06, "loss": 0.3705, "step": 9060 }, { "epoch": 1.814, "grad_norm": 1.7340755462646484, "learning_rate": 4.052809001421595e-06, "loss": 0.3218, "step": 9070 }, { "epoch": 1.8159999999999998, "grad_norm": 1.123704195022583, "learning_rate": 4.041386736733889e-06, "loss": 0.3743, "step": 9080 }, { "epoch": 1.818, "grad_norm": 1.6737864017486572, "learning_rate": 4.029969663338042e-06, "loss": 0.3169, "step": 9090 }, { "epoch": 1.8199999999999998, "grad_norm": 1.1411502361297607, "learning_rate": 4.018557843062282e-06, "loss": 0.2783, "step": 9100 }, { "epoch": 1.822, "grad_norm": 1.4459763765335083, "learning_rate": 4.007151337706391e-06, "loss": 0.3288, "step": 9110 }, { "epoch": 1.8239999999999998, "grad_norm": 1.1251758337020874, "learning_rate": 3.995750209041365e-06, "loss": 0.2966, "step": 9120 }, { "epoch": 1.826, "grad_norm": 1.534656286239624, "learning_rate": 3.98435451880909e-06, "loss": 0.3306, "step": 9130 }, { "epoch": 1.8279999999999998, "grad_norm": 1.2617437839508057, "learning_rate": 3.972964328721992e-06, "loss": 0.2749, "step": 9140 }, { "epoch": 1.83, "grad_norm": 1.1563127040863037, "learning_rate": 3.961579700462715e-06, "loss": 0.348, "step": 9150 }, { "epoch": 1.8319999999999999, "grad_norm": 1.5472575426101685, "learning_rate": 3.950200695683788e-06, "loss": 0.297, "step": 9160 }, { "epoch": 1.834, "grad_norm": 1.342626690864563, "learning_rate": 3.938827376007281e-06, "loss": 0.3328, "step": 9170 }, { "epoch": 1.8359999999999999, "grad_norm": 1.0215874910354614, "learning_rate": 3.927459803024475e-06, "loss": 0.3094, "step": 9180 }, { "epoch": 1.838, "grad_norm": 1.6598362922668457, "learning_rate": 3.9160980382955336e-06, "loss": 0.3091, "step": 9190 }, { "epoch": 1.8399999999999999, "grad_norm": 1.050584077835083, "learning_rate": 3.904742143349169e-06, "loss": 0.3078, "step": 9200 }, { "epoch": 1.842, "grad_norm": 1.560577630996704, "learning_rate": 3.893392179682304e-06, "loss": 0.323, "step": 9210 }, { "epoch": 1.8439999999999999, "grad_norm": 1.0825576782226562, "learning_rate": 3.882048208759735e-06, "loss": 0.3313, "step": 9220 }, { "epoch": 1.846, "grad_norm": 1.0511835813522339, "learning_rate": 3.870710292013815e-06, "loss": 0.3295, "step": 9230 }, { "epoch": 1.8479999999999999, "grad_norm": 1.3847706317901611, "learning_rate": 3.859378490844104e-06, "loss": 0.3417, "step": 9240 }, { "epoch": 1.85, "grad_norm": 1.1455384492874146, "learning_rate": 3.8480528666170495e-06, "loss": 0.3394, "step": 9250 }, { "epoch": 1.8519999999999999, "grad_norm": 1.5466185808181763, "learning_rate": 3.836733480665637e-06, "loss": 0.3222, "step": 9260 }, { "epoch": 1.854, "grad_norm": 1.2878413200378418, "learning_rate": 3.825420394289085e-06, "loss": 0.3517, "step": 9270 }, { "epoch": 1.8559999999999999, "grad_norm": 0.8397305011749268, "learning_rate": 3.814113668752486e-06, "loss": 0.3493, "step": 9280 }, { "epoch": 1.858, "grad_norm": 1.3638249635696411, "learning_rate": 3.8028133652864872e-06, "loss": 0.4023, "step": 9290 }, { "epoch": 1.8599999999999999, "grad_norm": 1.1861424446105957, "learning_rate": 3.791519545086963e-06, "loss": 0.3283, "step": 9300 }, { "epoch": 1.862, "grad_norm": 1.501705527305603, "learning_rate": 3.7802322693146726e-06, "loss": 0.3947, "step": 9310 }, { "epoch": 1.8639999999999999, "grad_norm": 1.1663841009140015, "learning_rate": 3.7689515990949364e-06, "loss": 0.3183, "step": 9320 }, { "epoch": 1.866, "grad_norm": 1.418528437614441, "learning_rate": 3.757677595517302e-06, "loss": 0.2897, "step": 9330 }, { "epoch": 1.8679999999999999, "grad_norm": 1.609471321105957, "learning_rate": 3.7464103196352176e-06, "loss": 0.3356, "step": 9340 }, { "epoch": 1.87, "grad_norm": 1.3416111469268799, "learning_rate": 3.7351498324656944e-06, "loss": 0.3206, "step": 9350 }, { "epoch": 1.8719999999999999, "grad_norm": 1.485053539276123, "learning_rate": 3.7238961949889796e-06, "loss": 0.3133, "step": 9360 }, { "epoch": 1.874, "grad_norm": 1.3999130725860596, "learning_rate": 3.7126494681482317e-06, "loss": 0.367, "step": 9370 }, { "epoch": 1.876, "grad_norm": 1.3190253973007202, "learning_rate": 3.70140971284918e-06, "loss": 0.3881, "step": 9380 }, { "epoch": 1.8780000000000001, "grad_norm": 1.468934416770935, "learning_rate": 3.690176989959801e-06, "loss": 0.3039, "step": 9390 }, { "epoch": 1.88, "grad_norm": 1.4125316143035889, "learning_rate": 3.678951360309988e-06, "loss": 0.3603, "step": 9400 }, { "epoch": 1.8820000000000001, "grad_norm": 1.2523292303085327, "learning_rate": 3.6677328846912237e-06, "loss": 0.3139, "step": 9410 }, { "epoch": 1.884, "grad_norm": 1.6310887336730957, "learning_rate": 3.6565216238562464e-06, "loss": 0.342, "step": 9420 }, { "epoch": 1.8860000000000001, "grad_norm": 1.028517723083496, "learning_rate": 3.645317638518721e-06, "loss": 0.3088, "step": 9430 }, { "epoch": 1.888, "grad_norm": 1.4186135530471802, "learning_rate": 3.6341209893529195e-06, "loss": 0.3238, "step": 9440 }, { "epoch": 1.8900000000000001, "grad_norm": 1.2561564445495605, "learning_rate": 3.6229317369933786e-06, "loss": 0.3813, "step": 9450 }, { "epoch": 1.892, "grad_norm": 1.3972420692443848, "learning_rate": 3.61174994203458e-06, "loss": 0.3561, "step": 9460 }, { "epoch": 1.8940000000000001, "grad_norm": 1.1440085172653198, "learning_rate": 3.6005756650306258e-06, "loss": 0.2912, "step": 9470 }, { "epoch": 1.896, "grad_norm": 1.243796944618225, "learning_rate": 3.589408966494897e-06, "loss": 0.2798, "step": 9480 }, { "epoch": 1.8980000000000001, "grad_norm": 1.2843304872512817, "learning_rate": 3.5782499068997386e-06, "loss": 0.3649, "step": 9490 }, { "epoch": 1.9, "grad_norm": 1.2538890838623047, "learning_rate": 3.5670985466761243e-06, "loss": 0.3573, "step": 9500 }, { "epoch": 1.9020000000000001, "grad_norm": 1.5263408422470093, "learning_rate": 3.5559549462133407e-06, "loss": 0.3468, "step": 9510 }, { "epoch": 1.904, "grad_norm": 1.2119961977005005, "learning_rate": 3.5448191658586423e-06, "loss": 0.2936, "step": 9520 }, { "epoch": 1.9060000000000001, "grad_norm": 1.630476474761963, "learning_rate": 3.5336912659169366e-06, "loss": 0.3447, "step": 9530 }, { "epoch": 1.908, "grad_norm": 1.7474474906921387, "learning_rate": 3.522571306650462e-06, "loss": 0.3666, "step": 9540 }, { "epoch": 1.9100000000000001, "grad_norm": 2.313354730606079, "learning_rate": 3.511459348278448e-06, "loss": 0.3717, "step": 9550 }, { "epoch": 1.912, "grad_norm": 1.660021185874939, "learning_rate": 3.5003554509767966e-06, "loss": 0.3335, "step": 9560 }, { "epoch": 1.9140000000000001, "grad_norm": 1.404435396194458, "learning_rate": 3.4892596748777563e-06, "loss": 0.3466, "step": 9570 }, { "epoch": 1.916, "grad_norm": 1.2691371440887451, "learning_rate": 3.4781720800696006e-06, "loss": 0.3342, "step": 9580 }, { "epoch": 1.9180000000000001, "grad_norm": 1.8161771297454834, "learning_rate": 3.4670927265962908e-06, "loss": 0.3479, "step": 9590 }, { "epoch": 1.92, "grad_norm": 1.5305116176605225, "learning_rate": 3.4560216744571607e-06, "loss": 0.3067, "step": 9600 }, { "epoch": 1.9220000000000002, "grad_norm": 1.5921387672424316, "learning_rate": 3.444958983606592e-06, "loss": 0.3088, "step": 9610 }, { "epoch": 1.924, "grad_norm": 1.490713119506836, "learning_rate": 3.433904713953682e-06, "loss": 0.3352, "step": 9620 }, { "epoch": 1.9260000000000002, "grad_norm": 1.590400218963623, "learning_rate": 3.4228589253619247e-06, "loss": 0.3203, "step": 9630 }, { "epoch": 1.928, "grad_norm": 1.140105128288269, "learning_rate": 3.411821677648887e-06, "loss": 0.3612, "step": 9640 }, { "epoch": 1.9300000000000002, "grad_norm": 1.3013497591018677, "learning_rate": 3.400793030585884e-06, "loss": 0.277, "step": 9650 }, { "epoch": 1.932, "grad_norm": 1.3757359981536865, "learning_rate": 3.389773043897652e-06, "loss": 0.3651, "step": 9660 }, { "epoch": 1.9340000000000002, "grad_norm": 1.2634108066558838, "learning_rate": 3.378761777262028e-06, "loss": 0.3305, "step": 9670 }, { "epoch": 1.936, "grad_norm": 1.5106260776519775, "learning_rate": 3.36775929030963e-06, "loss": 0.3067, "step": 9680 }, { "epoch": 1.938, "grad_norm": 1.1745408773422241, "learning_rate": 3.3567656426235275e-06, "loss": 0.364, "step": 9690 }, { "epoch": 1.94, "grad_norm": 1.7287566661834717, "learning_rate": 3.34578089373892e-06, "loss": 0.3663, "step": 9700 }, { "epoch": 1.942, "grad_norm": 1.4090641736984253, "learning_rate": 3.3348051031428184e-06, "loss": 0.2921, "step": 9710 }, { "epoch": 1.944, "grad_norm": 1.6581404209136963, "learning_rate": 3.323838330273723e-06, "loss": 0.4043, "step": 9720 }, { "epoch": 1.946, "grad_norm": 1.7018346786499023, "learning_rate": 3.312880634521295e-06, "loss": 0.3818, "step": 9730 }, { "epoch": 1.948, "grad_norm": 1.4111535549163818, "learning_rate": 3.301932075226041e-06, "loss": 0.33, "step": 9740 }, { "epoch": 1.95, "grad_norm": 1.4237406253814697, "learning_rate": 3.2909927116789908e-06, "loss": 0.3326, "step": 9750 }, { "epoch": 1.952, "grad_norm": 1.3221118450164795, "learning_rate": 3.280062603121373e-06, "loss": 0.2659, "step": 9760 }, { "epoch": 1.954, "grad_norm": 1.3768141269683838, "learning_rate": 3.2691418087442995e-06, "loss": 0.3813, "step": 9770 }, { "epoch": 1.956, "grad_norm": 1.5262187719345093, "learning_rate": 3.2582303876884406e-06, "loss": 0.3554, "step": 9780 }, { "epoch": 1.958, "grad_norm": 1.184778094291687, "learning_rate": 3.247328399043706e-06, "loss": 0.2866, "step": 9790 }, { "epoch": 1.96, "grad_norm": 1.3611551523208618, "learning_rate": 3.2364359018489245e-06, "loss": 0.3865, "step": 9800 }, { "epoch": 1.962, "grad_norm": 1.296720027923584, "learning_rate": 3.2255529550915242e-06, "loss": 0.3272, "step": 9810 }, { "epoch": 1.964, "grad_norm": 1.1612666845321655, "learning_rate": 3.2146796177072183e-06, "loss": 0.2417, "step": 9820 }, { "epoch": 1.966, "grad_norm": 1.3343042135238647, "learning_rate": 3.203815948579674e-06, "loss": 0.2749, "step": 9830 }, { "epoch": 1.968, "grad_norm": 1.5263704061508179, "learning_rate": 3.192962006540205e-06, "loss": 0.2788, "step": 9840 }, { "epoch": 1.97, "grad_norm": 1.9184904098510742, "learning_rate": 3.1821178503674515e-06, "loss": 0.2875, "step": 9850 }, { "epoch": 1.972, "grad_norm": 1.6000325679779053, "learning_rate": 3.1712835387870527e-06, "loss": 0.3293, "step": 9860 }, { "epoch": 1.974, "grad_norm": 1.1482776403427124, "learning_rate": 3.1604591304713394e-06, "loss": 0.3441, "step": 9870 }, { "epoch": 1.976, "grad_norm": 1.1263827085494995, "learning_rate": 3.149644684039008e-06, "loss": 0.3172, "step": 9880 }, { "epoch": 1.978, "grad_norm": 1.3861206769943237, "learning_rate": 3.1388402580548154e-06, "loss": 0.3633, "step": 9890 }, { "epoch": 1.98, "grad_norm": 1.7515361309051514, "learning_rate": 3.1280459110292474e-06, "loss": 0.3882, "step": 9900 }, { "epoch": 1.982, "grad_norm": 1.409308671951294, "learning_rate": 3.117261701418204e-06, "loss": 0.3439, "step": 9910 }, { "epoch": 1.984, "grad_norm": 1.6333212852478027, "learning_rate": 3.106487687622697e-06, "loss": 0.3062, "step": 9920 }, { "epoch": 1.986, "grad_norm": 1.5096412897109985, "learning_rate": 3.095723927988517e-06, "loss": 0.3322, "step": 9930 }, { "epoch": 1.988, "grad_norm": 0.9250149726867676, "learning_rate": 3.0849704808059266e-06, "loss": 0.3363, "step": 9940 }, { "epoch": 1.99, "grad_norm": 1.4919676780700684, "learning_rate": 3.074227404309336e-06, "loss": 0.3459, "step": 9950 }, { "epoch": 1.992, "grad_norm": 1.1677219867706299, "learning_rate": 3.063494756677005e-06, "loss": 0.3087, "step": 9960 }, { "epoch": 1.994, "grad_norm": 1.3163172006607056, "learning_rate": 3.0527725960307083e-06, "loss": 0.3544, "step": 9970 }, { "epoch": 1.996, "grad_norm": 1.1766602993011475, "learning_rate": 3.0420609804354295e-06, "loss": 0.2968, "step": 9980 }, { "epoch": 1.998, "grad_norm": 1.754724144935608, "learning_rate": 3.0313599678990514e-06, "loss": 0.3324, "step": 9990 }, { "epoch": 2.0, "grad_norm": 1.334686279296875, "learning_rate": 3.0206696163720317e-06, "loss": 0.3427, "step": 10000 }, { "epoch": 2.002, "grad_norm": 1.149556279182434, "learning_rate": 3.0099899837470976e-06, "loss": 0.2296, "step": 10010 }, { "epoch": 2.004, "grad_norm": 2.647596597671509, "learning_rate": 2.999321127858925e-06, "loss": 0.1997, "step": 10020 }, { "epoch": 2.006, "grad_norm": 1.3816395998001099, "learning_rate": 2.9886631064838355e-06, "loss": 0.1688, "step": 10030 }, { "epoch": 2.008, "grad_norm": 1.5958433151245117, "learning_rate": 2.9780159773394713e-06, "loss": 0.1862, "step": 10040 }, { "epoch": 2.01, "grad_norm": 1.7773202657699585, "learning_rate": 2.96737979808449e-06, "loss": 0.1681, "step": 10050 }, { "epoch": 2.012, "grad_norm": 1.0190283060073853, "learning_rate": 2.9567546263182554e-06, "loss": 0.1732, "step": 10060 }, { "epoch": 2.014, "grad_norm": 1.5294930934906006, "learning_rate": 2.9461405195805146e-06, "loss": 0.1918, "step": 10070 }, { "epoch": 2.016, "grad_norm": 1.375820517539978, "learning_rate": 2.9355375353510973e-06, "loss": 0.1856, "step": 10080 }, { "epoch": 2.018, "grad_norm": 1.9150773286819458, "learning_rate": 2.9249457310495994e-06, "loss": 0.2228, "step": 10090 }, { "epoch": 2.02, "grad_norm": 1.4141868352890015, "learning_rate": 2.91436516403507e-06, "loss": 0.2042, "step": 10100 }, { "epoch": 2.022, "grad_norm": 1.665558099746704, "learning_rate": 2.9037958916057104e-06, "loss": 0.1844, "step": 10110 }, { "epoch": 2.024, "grad_norm": 1.4332926273345947, "learning_rate": 2.893237970998547e-06, "loss": 0.2012, "step": 10120 }, { "epoch": 2.026, "grad_norm": 1.575321078300476, "learning_rate": 2.8826914593891396e-06, "loss": 0.2173, "step": 10130 }, { "epoch": 2.028, "grad_norm": 1.420453667640686, "learning_rate": 2.872156413891263e-06, "loss": 0.1859, "step": 10140 }, { "epoch": 2.03, "grad_norm": 1.3123276233673096, "learning_rate": 2.8616328915565907e-06, "loss": 0.1539, "step": 10150 }, { "epoch": 2.032, "grad_norm": 2.069629430770874, "learning_rate": 2.8511209493744015e-06, "loss": 0.1664, "step": 10160 }, { "epoch": 2.034, "grad_norm": 1.5814684629440308, "learning_rate": 2.8406206442712618e-06, "loss": 0.2212, "step": 10170 }, { "epoch": 2.036, "grad_norm": 1.5895538330078125, "learning_rate": 2.830132033110713e-06, "loss": 0.1419, "step": 10180 }, { "epoch": 2.038, "grad_norm": 1.795087456703186, "learning_rate": 2.8196551726929745e-06, "loss": 0.1644, "step": 10190 }, { "epoch": 2.04, "grad_norm": 1.6379607915878296, "learning_rate": 2.80919011975463e-06, "loss": 0.1866, "step": 10200 }, { "epoch": 2.042, "grad_norm": 1.2147035598754883, "learning_rate": 2.798736930968315e-06, "loss": 0.1596, "step": 10210 }, { "epoch": 2.044, "grad_norm": 1.3540210723876953, "learning_rate": 2.788295662942423e-06, "loss": 0.1673, "step": 10220 }, { "epoch": 2.046, "grad_norm": 1.2441480159759521, "learning_rate": 2.777866372220789e-06, "loss": 0.1877, "step": 10230 }, { "epoch": 2.048, "grad_norm": 2.1177780628204346, "learning_rate": 2.7674491152823825e-06, "loss": 0.2515, "step": 10240 }, { "epoch": 2.05, "grad_norm": 1.9897186756134033, "learning_rate": 2.7570439485410116e-06, "loss": 0.1739, "step": 10250 }, { "epoch": 2.052, "grad_norm": 2.053022861480713, "learning_rate": 2.7466509283450026e-06, "loss": 0.1839, "step": 10260 }, { "epoch": 2.054, "grad_norm": 1.2909846305847168, "learning_rate": 2.736270110976912e-06, "loss": 0.1838, "step": 10270 }, { "epoch": 2.056, "grad_norm": 1.4775480031967163, "learning_rate": 2.7259015526532074e-06, "loss": 0.1464, "step": 10280 }, { "epoch": 2.058, "grad_norm": 2.2092342376708984, "learning_rate": 2.7155453095239682e-06, "loss": 0.1622, "step": 10290 }, { "epoch": 2.06, "grad_norm": 1.2695140838623047, "learning_rate": 2.705201437672585e-06, "loss": 0.1665, "step": 10300 }, { "epoch": 2.062, "grad_norm": 1.4544492959976196, "learning_rate": 2.6948699931154533e-06, "loss": 0.158, "step": 10310 }, { "epoch": 2.064, "grad_norm": 1.6260836124420166, "learning_rate": 2.684551031801662e-06, "loss": 0.1697, "step": 10320 }, { "epoch": 2.066, "grad_norm": 1.1283328533172607, "learning_rate": 2.6742446096127086e-06, "loss": 0.1621, "step": 10330 }, { "epoch": 2.068, "grad_norm": 1.4716851711273193, "learning_rate": 2.66395078236218e-06, "loss": 0.1704, "step": 10340 }, { "epoch": 2.07, "grad_norm": 1.6541157960891724, "learning_rate": 2.6536696057954553e-06, "loss": 0.2194, "step": 10350 }, { "epoch": 2.072, "grad_norm": 1.5134286880493164, "learning_rate": 2.6434011355894074e-06, "loss": 0.185, "step": 10360 }, { "epoch": 2.074, "grad_norm": 1.936950922012329, "learning_rate": 2.633145427352102e-06, "loss": 0.1546, "step": 10370 }, { "epoch": 2.076, "grad_norm": 2.237762212753296, "learning_rate": 2.6229025366224835e-06, "loss": 0.1923, "step": 10380 }, { "epoch": 2.078, "grad_norm": 2.4793312549591064, "learning_rate": 2.612672518870093e-06, "loss": 0.1842, "step": 10390 }, { "epoch": 2.08, "grad_norm": 1.5138216018676758, "learning_rate": 2.602455429494758e-06, "loss": 0.1437, "step": 10400 }, { "epoch": 2.082, "grad_norm": 1.5739482641220093, "learning_rate": 2.5922513238262915e-06, "loss": 0.1765, "step": 10410 }, { "epoch": 2.084, "grad_norm": 1.1942251920700073, "learning_rate": 2.582060257124195e-06, "loss": 0.1793, "step": 10420 }, { "epoch": 2.086, "grad_norm": 1.485691785812378, "learning_rate": 2.5718822845773516e-06, "loss": 0.1927, "step": 10430 }, { "epoch": 2.088, "grad_norm": 1.794269323348999, "learning_rate": 2.5617174613037503e-06, "loss": 0.1779, "step": 10440 }, { "epoch": 2.09, "grad_norm": 1.6613082885742188, "learning_rate": 2.5515658423501573e-06, "loss": 0.1955, "step": 10450 }, { "epoch": 2.092, "grad_norm": 1.5060651302337646, "learning_rate": 2.541427482691832e-06, "loss": 0.1463, "step": 10460 }, { "epoch": 2.094, "grad_norm": 1.6522201299667358, "learning_rate": 2.5313024372322413e-06, "loss": 0.2071, "step": 10470 }, { "epoch": 2.096, "grad_norm": 2.138770818710327, "learning_rate": 2.5211907608027366e-06, "loss": 0.182, "step": 10480 }, { "epoch": 2.098, "grad_norm": 2.109570264816284, "learning_rate": 2.5110925081622796e-06, "loss": 0.1743, "step": 10490 }, { "epoch": 2.1, "grad_norm": 1.1957253217697144, "learning_rate": 2.5010077339971283e-06, "loss": 0.1715, "step": 10500 }, { "epoch": 2.102, "grad_norm": 1.4688587188720703, "learning_rate": 2.4909364929205575e-06, "loss": 0.1838, "step": 10510 }, { "epoch": 2.104, "grad_norm": 1.1424540281295776, "learning_rate": 2.480878839472552e-06, "loss": 0.1815, "step": 10520 }, { "epoch": 2.106, "grad_norm": 1.2021138668060303, "learning_rate": 2.470834828119509e-06, "loss": 0.1657, "step": 10530 }, { "epoch": 2.108, "grad_norm": 1.8013497591018677, "learning_rate": 2.4608045132539536e-06, "loss": 0.2006, "step": 10540 }, { "epoch": 2.11, "grad_norm": 2.014941930770874, "learning_rate": 2.4507879491942388e-06, "loss": 0.1859, "step": 10550 }, { "epoch": 2.112, "grad_norm": 1.798327922821045, "learning_rate": 2.4407851901842465e-06, "loss": 0.1798, "step": 10560 }, { "epoch": 2.114, "grad_norm": 1.7324855327606201, "learning_rate": 2.4307962903931025e-06, "loss": 0.1823, "step": 10570 }, { "epoch": 2.116, "grad_norm": 1.536121129989624, "learning_rate": 2.4208213039148803e-06, "loss": 0.1776, "step": 10580 }, { "epoch": 2.118, "grad_norm": 2.203296422958374, "learning_rate": 2.4108602847683012e-06, "loss": 0.2172, "step": 10590 }, { "epoch": 2.12, "grad_norm": 1.6374783515930176, "learning_rate": 2.4009132868964525e-06, "loss": 0.1755, "step": 10600 }, { "epoch": 2.122, "grad_norm": 2.1419484615325928, "learning_rate": 2.3909803641664907e-06, "loss": 0.1998, "step": 10610 }, { "epoch": 2.124, "grad_norm": 1.4426078796386719, "learning_rate": 2.3810615703693446e-06, "loss": 0.207, "step": 10620 }, { "epoch": 2.126, "grad_norm": 1.7481943368911743, "learning_rate": 2.3711569592194363e-06, "loss": 0.1504, "step": 10630 }, { "epoch": 2.128, "grad_norm": 1.914016604423523, "learning_rate": 2.3612665843543737e-06, "loss": 0.1743, "step": 10640 }, { "epoch": 2.13, "grad_norm": 1.829076886177063, "learning_rate": 2.3513904993346775e-06, "loss": 0.2314, "step": 10650 }, { "epoch": 2.132, "grad_norm": 1.4599560499191284, "learning_rate": 2.3415287576434807e-06, "loss": 0.1967, "step": 10660 }, { "epoch": 2.134, "grad_norm": 1.5102792978286743, "learning_rate": 2.3316814126862377e-06, "loss": 0.1772, "step": 10670 }, { "epoch": 2.136, "grad_norm": 1.5047998428344727, "learning_rate": 2.321848517790442e-06, "loss": 0.2231, "step": 10680 }, { "epoch": 2.138, "grad_norm": 1.4379587173461914, "learning_rate": 2.312030126205335e-06, "loss": 0.1765, "step": 10690 }, { "epoch": 2.14, "grad_norm": 2.076664447784424, "learning_rate": 2.302226291101609e-06, "loss": 0.1563, "step": 10700 }, { "epoch": 2.142, "grad_norm": 1.7419641017913818, "learning_rate": 2.2924370655711407e-06, "loss": 0.1456, "step": 10710 }, { "epoch": 2.144, "grad_norm": 1.5634517669677734, "learning_rate": 2.282662502626678e-06, "loss": 0.1508, "step": 10720 }, { "epoch": 2.146, "grad_norm": 2.313948631286621, "learning_rate": 2.2729026552015653e-06, "loss": 0.1975, "step": 10730 }, { "epoch": 2.148, "grad_norm": 1.3335062265396118, "learning_rate": 2.263157576149463e-06, "loss": 0.1611, "step": 10740 }, { "epoch": 2.15, "grad_norm": 1.4061542749404907, "learning_rate": 2.2534273182440515e-06, "loss": 0.1353, "step": 10750 }, { "epoch": 2.152, "grad_norm": 2.1555397510528564, "learning_rate": 2.2437119341787444e-06, "loss": 0.1941, "step": 10760 }, { "epoch": 2.154, "grad_norm": 1.6966259479522705, "learning_rate": 2.2340114765664137e-06, "loss": 0.1828, "step": 10770 }, { "epoch": 2.156, "grad_norm": 1.6515167951583862, "learning_rate": 2.224325997939095e-06, "loss": 0.1654, "step": 10780 }, { "epoch": 2.158, "grad_norm": 1.68687105178833, "learning_rate": 2.214655550747709e-06, "loss": 0.1473, "step": 10790 }, { "epoch": 2.16, "grad_norm": 1.4554318189620972, "learning_rate": 2.2050001873617716e-06, "loss": 0.1534, "step": 10800 }, { "epoch": 2.162, "grad_norm": 1.4342589378356934, "learning_rate": 2.19535996006911e-06, "loss": 0.1541, "step": 10810 }, { "epoch": 2.164, "grad_norm": 1.8129370212554932, "learning_rate": 2.1857349210755956e-06, "loss": 0.1739, "step": 10820 }, { "epoch": 2.166, "grad_norm": 1.8690457344055176, "learning_rate": 2.1761251225048385e-06, "loss": 0.1665, "step": 10830 }, { "epoch": 2.168, "grad_norm": 1.6199536323547363, "learning_rate": 2.1665306163979132e-06, "loss": 0.141, "step": 10840 }, { "epoch": 2.17, "grad_norm": 1.7210586071014404, "learning_rate": 2.156951454713093e-06, "loss": 0.1729, "step": 10850 }, { "epoch": 2.172, "grad_norm": 2.0658435821533203, "learning_rate": 2.147387689325539e-06, "loss": 0.2072, "step": 10860 }, { "epoch": 2.174, "grad_norm": 2.763272762298584, "learning_rate": 2.137839372027047e-06, "loss": 0.2201, "step": 10870 }, { "epoch": 2.176, "grad_norm": 1.9413820505142212, "learning_rate": 2.1283065545257443e-06, "loss": 0.1413, "step": 10880 }, { "epoch": 2.178, "grad_norm": 1.5634864568710327, "learning_rate": 2.118789288445829e-06, "loss": 0.1497, "step": 10890 }, { "epoch": 2.18, "grad_norm": 2.5157878398895264, "learning_rate": 2.1092876253272793e-06, "loss": 0.1919, "step": 10900 }, { "epoch": 2.182, "grad_norm": 1.4105507135391235, "learning_rate": 2.099801616625573e-06, "loss": 0.1154, "step": 10910 }, { "epoch": 2.184, "grad_norm": 1.6929118633270264, "learning_rate": 2.090331313711417e-06, "loss": 0.1904, "step": 10920 }, { "epoch": 2.186, "grad_norm": 2.1146175861358643, "learning_rate": 2.080876767870466e-06, "loss": 0.191, "step": 10930 }, { "epoch": 2.188, "grad_norm": 1.6462335586547852, "learning_rate": 2.0714380303030373e-06, "loss": 0.2233, "step": 10940 }, { "epoch": 2.19, "grad_norm": 2.0763607025146484, "learning_rate": 2.0620151521238453e-06, "loss": 0.1847, "step": 10950 }, { "epoch": 2.192, "grad_norm": 1.8710356950759888, "learning_rate": 2.0526081843617183e-06, "loss": 0.1867, "step": 10960 }, { "epoch": 2.194, "grad_norm": 1.8661998510360718, "learning_rate": 2.04321717795932e-06, "loss": 0.1691, "step": 10970 }, { "epoch": 2.196, "grad_norm": 1.706262230873108, "learning_rate": 2.03384218377288e-06, "loss": 0.1848, "step": 10980 }, { "epoch": 2.198, "grad_norm": 1.395787239074707, "learning_rate": 2.0244832525719155e-06, "loss": 0.1661, "step": 10990 }, { "epoch": 2.2, "grad_norm": 1.469756841659546, "learning_rate": 2.015140435038951e-06, "loss": 0.1701, "step": 11000 }, { "epoch": 2.202, "grad_norm": 2.2943742275238037, "learning_rate": 2.005813781769253e-06, "loss": 0.159, "step": 11010 }, { "epoch": 2.204, "grad_norm": 2.527089834213257, "learning_rate": 1.996503343270554e-06, "loss": 0.2051, "step": 11020 }, { "epoch": 2.206, "grad_norm": 2.241828680038452, "learning_rate": 1.987209169962769e-06, "loss": 0.1856, "step": 11030 }, { "epoch": 2.208, "grad_norm": 2.044684410095215, "learning_rate": 1.9779313121777382e-06, "loss": 0.1704, "step": 11040 }, { "epoch": 2.21, "grad_norm": 1.545622706413269, "learning_rate": 1.9686698201589395e-06, "loss": 0.1666, "step": 11050 }, { "epoch": 2.212, "grad_norm": 2.001255750656128, "learning_rate": 1.9594247440612293e-06, "loss": 0.1813, "step": 11060 }, { "epoch": 2.214, "grad_norm": 2.0734426975250244, "learning_rate": 1.9501961339505626e-06, "loss": 0.2223, "step": 11070 }, { "epoch": 2.216, "grad_norm": 1.9134544134140015, "learning_rate": 1.94098403980372e-06, "loss": 0.1919, "step": 11080 }, { "epoch": 2.218, "grad_norm": 1.5950722694396973, "learning_rate": 1.9317885115080514e-06, "loss": 0.234, "step": 11090 }, { "epoch": 2.22, "grad_norm": 1.8253746032714844, "learning_rate": 1.922609598861187e-06, "loss": 0.1471, "step": 11100 }, { "epoch": 2.222, "grad_norm": 1.5433813333511353, "learning_rate": 1.913447351570776e-06, "loss": 0.1804, "step": 11110 }, { "epoch": 2.224, "grad_norm": 2.1074371337890625, "learning_rate": 1.9043018192542228e-06, "loss": 0.2, "step": 11120 }, { "epoch": 2.226, "grad_norm": 1.8115590810775757, "learning_rate": 1.8951730514384103e-06, "loss": 0.2082, "step": 11130 }, { "epoch": 2.228, "grad_norm": 0.8639247417449951, "learning_rate": 1.8860610975594384e-06, "loss": 0.1944, "step": 11140 }, { "epoch": 2.23, "grad_norm": 1.839328408241272, "learning_rate": 1.8769660069623448e-06, "loss": 0.212, "step": 11150 }, { "epoch": 2.232, "grad_norm": 1.7485737800598145, "learning_rate": 1.8678878289008511e-06, "loss": 0.2095, "step": 11160 }, { "epoch": 2.234, "grad_norm": 1.7731024026870728, "learning_rate": 1.8588266125370929e-06, "loss": 0.1836, "step": 11170 }, { "epoch": 2.2359999999999998, "grad_norm": 1.8955568075180054, "learning_rate": 1.8497824069413445e-06, "loss": 0.2007, "step": 11180 }, { "epoch": 2.238, "grad_norm": 1.6335498094558716, "learning_rate": 1.84075526109176e-06, "loss": 0.1826, "step": 11190 }, { "epoch": 2.24, "grad_norm": 1.6790274381637573, "learning_rate": 1.831745223874118e-06, "loss": 0.1806, "step": 11200 }, { "epoch": 2.242, "grad_norm": 1.6211411952972412, "learning_rate": 1.8227523440815331e-06, "loss": 0.1884, "step": 11210 }, { "epoch": 2.2439999999999998, "grad_norm": 1.6547083854675293, "learning_rate": 1.8137766704142141e-06, "loss": 0.1747, "step": 11220 }, { "epoch": 2.246, "grad_norm": 1.8069316148757935, "learning_rate": 1.8048182514791901e-06, "loss": 0.1676, "step": 11230 }, { "epoch": 2.248, "grad_norm": 2.3242225646972656, "learning_rate": 1.7958771357900446e-06, "loss": 0.1868, "step": 11240 }, { "epoch": 2.25, "grad_norm": 2.2176382541656494, "learning_rate": 1.7869533717666626e-06, "loss": 0.1645, "step": 11250 }, { "epoch": 2.252, "grad_norm": 1.437696933746338, "learning_rate": 1.7780470077349566e-06, "loss": 0.1623, "step": 11260 }, { "epoch": 2.254, "grad_norm": 2.0108892917633057, "learning_rate": 1.769158091926615e-06, "loss": 0.2113, "step": 11270 }, { "epoch": 2.2560000000000002, "grad_norm": 1.2373582124710083, "learning_rate": 1.760286672478837e-06, "loss": 0.1587, "step": 11280 }, { "epoch": 2.258, "grad_norm": 1.3458646535873413, "learning_rate": 1.751432797434068e-06, "loss": 0.2014, "step": 11290 }, { "epoch": 2.26, "grad_norm": 1.4614745378494263, "learning_rate": 1.7425965147397462e-06, "loss": 0.1785, "step": 11300 }, { "epoch": 2.262, "grad_norm": 2.939737558364868, "learning_rate": 1.7337778722480413e-06, "loss": 0.2342, "step": 11310 }, { "epoch": 2.2640000000000002, "grad_norm": 1.6198670864105225, "learning_rate": 1.7249769177155879e-06, "loss": 0.1486, "step": 11320 }, { "epoch": 2.266, "grad_norm": 2.2954392433166504, "learning_rate": 1.7161936988032386e-06, "loss": 0.196, "step": 11330 }, { "epoch": 2.268, "grad_norm": 1.6478867530822754, "learning_rate": 1.7074282630757998e-06, "loss": 0.1795, "step": 11340 }, { "epoch": 2.27, "grad_norm": 1.3108015060424805, "learning_rate": 1.6986806580017695e-06, "loss": 0.1213, "step": 11350 }, { "epoch": 2.2720000000000002, "grad_norm": 1.4279754161834717, "learning_rate": 1.689950930953091e-06, "loss": 0.1663, "step": 11360 }, { "epoch": 2.274, "grad_norm": 1.3529280424118042, "learning_rate": 1.6812391292048897e-06, "loss": 0.1478, "step": 11370 }, { "epoch": 2.276, "grad_norm": 2.1252269744873047, "learning_rate": 1.6725452999352137e-06, "loss": 0.1675, "step": 11380 }, { "epoch": 2.278, "grad_norm": 2.0060176849365234, "learning_rate": 1.6638694902247866e-06, "loss": 0.2113, "step": 11390 }, { "epoch": 2.2800000000000002, "grad_norm": 2.1761856079101562, "learning_rate": 1.655211747056749e-06, "loss": 0.1726, "step": 11400 }, { "epoch": 2.282, "grad_norm": 1.4057432413101196, "learning_rate": 1.6465721173164e-06, "loss": 0.1713, "step": 11410 }, { "epoch": 2.284, "grad_norm": 1.271407961845398, "learning_rate": 1.6379506477909518e-06, "loss": 0.173, "step": 11420 }, { "epoch": 2.286, "grad_norm": 1.5162497758865356, "learning_rate": 1.629347385169263e-06, "loss": 0.1821, "step": 11430 }, { "epoch": 2.288, "grad_norm": 1.081526517868042, "learning_rate": 1.6207623760416074e-06, "loss": 0.1428, "step": 11440 }, { "epoch": 2.29, "grad_norm": 1.603796362876892, "learning_rate": 1.6121956668993977e-06, "loss": 0.1669, "step": 11450 }, { "epoch": 2.292, "grad_norm": 1.392375111579895, "learning_rate": 1.6036473041349438e-06, "loss": 0.1698, "step": 11460 }, { "epoch": 2.294, "grad_norm": 1.9549520015716553, "learning_rate": 1.5951173340412134e-06, "loss": 0.2162, "step": 11470 }, { "epoch": 2.296, "grad_norm": 1.8508301973342896, "learning_rate": 1.58660580281156e-06, "loss": 0.1534, "step": 11480 }, { "epoch": 2.298, "grad_norm": 1.5918387174606323, "learning_rate": 1.5781127565394838e-06, "loss": 0.161, "step": 11490 }, { "epoch": 2.3, "grad_norm": 2.0230064392089844, "learning_rate": 1.5696382412183853e-06, "loss": 0.1709, "step": 11500 }, { "epoch": 2.302, "grad_norm": 2.3765511512756348, "learning_rate": 1.5611823027413109e-06, "loss": 0.1861, "step": 11510 }, { "epoch": 2.304, "grad_norm": 1.9833528995513916, "learning_rate": 1.5527449869007055e-06, "loss": 0.2067, "step": 11520 }, { "epoch": 2.306, "grad_norm": 1.4727070331573486, "learning_rate": 1.5443263393881619e-06, "loss": 0.1758, "step": 11530 }, { "epoch": 2.308, "grad_norm": 1.8662854433059692, "learning_rate": 1.535926405794179e-06, "loss": 0.1892, "step": 11540 }, { "epoch": 2.31, "grad_norm": 1.810899257659912, "learning_rate": 1.5275452316079143e-06, "loss": 0.1321, "step": 11550 }, { "epoch": 2.312, "grad_norm": 2.3489248752593994, "learning_rate": 1.519182862216929e-06, "loss": 0.1976, "step": 11560 }, { "epoch": 2.314, "grad_norm": 1.5641025304794312, "learning_rate": 1.5108393429069501e-06, "loss": 0.1718, "step": 11570 }, { "epoch": 2.316, "grad_norm": 2.0649514198303223, "learning_rate": 1.5025147188616308e-06, "loss": 0.1905, "step": 11580 }, { "epoch": 2.318, "grad_norm": 1.309147596359253, "learning_rate": 1.4942090351622884e-06, "loss": 0.185, "step": 11590 }, { "epoch": 2.32, "grad_norm": 1.3974589109420776, "learning_rate": 1.4859223367876762e-06, "loss": 0.1564, "step": 11600 }, { "epoch": 2.322, "grad_norm": 1.8548953533172607, "learning_rate": 1.477654668613735e-06, "loss": 0.1659, "step": 11610 }, { "epoch": 2.324, "grad_norm": 1.7054270505905151, "learning_rate": 1.469406075413342e-06, "loss": 0.2003, "step": 11620 }, { "epoch": 2.326, "grad_norm": 1.4067456722259521, "learning_rate": 1.4611766018560835e-06, "loss": 0.1569, "step": 11630 }, { "epoch": 2.328, "grad_norm": 2.1007652282714844, "learning_rate": 1.4529662925080023e-06, "loss": 0.1882, "step": 11640 }, { "epoch": 2.33, "grad_norm": 1.6421812772750854, "learning_rate": 1.4447751918313552e-06, "loss": 0.2184, "step": 11650 }, { "epoch": 2.332, "grad_norm": 2.9512548446655273, "learning_rate": 1.4366033441843823e-06, "loss": 0.1994, "step": 11660 }, { "epoch": 2.334, "grad_norm": 1.2109729051589966, "learning_rate": 1.4284507938210545e-06, "loss": 0.1702, "step": 11670 }, { "epoch": 2.336, "grad_norm": 1.767602801322937, "learning_rate": 1.420317584890844e-06, "loss": 0.1508, "step": 11680 }, { "epoch": 2.338, "grad_norm": 1.924599289894104, "learning_rate": 1.4122037614384814e-06, "loss": 0.2122, "step": 11690 }, { "epoch": 2.34, "grad_norm": 1.8495938777923584, "learning_rate": 1.404109367403712e-06, "loss": 0.1816, "step": 11700 }, { "epoch": 2.342, "grad_norm": 1.7605769634246826, "learning_rate": 1.3960344466210669e-06, "loss": 0.2041, "step": 11710 }, { "epoch": 2.344, "grad_norm": 2.3613359928131104, "learning_rate": 1.3879790428196226e-06, "loss": 0.1848, "step": 11720 }, { "epoch": 2.346, "grad_norm": 1.8871616125106812, "learning_rate": 1.3799431996227569e-06, "loss": 0.1771, "step": 11730 }, { "epoch": 2.348, "grad_norm": 1.523779034614563, "learning_rate": 1.3719269605479241e-06, "loss": 0.1653, "step": 11740 }, { "epoch": 2.35, "grad_norm": 1.7912616729736328, "learning_rate": 1.363930369006415e-06, "loss": 0.1834, "step": 11750 }, { "epoch": 2.352, "grad_norm": 2.330554485321045, "learning_rate": 1.3559534683031133e-06, "loss": 0.1694, "step": 11760 }, { "epoch": 2.354, "grad_norm": 1.3133292198181152, "learning_rate": 1.3479963016362768e-06, "loss": 0.1925, "step": 11770 }, { "epoch": 2.356, "grad_norm": 1.9684919118881226, "learning_rate": 1.3400589120972922e-06, "loss": 0.1448, "step": 11780 }, { "epoch": 2.358, "grad_norm": 1.2997586727142334, "learning_rate": 1.3321413426704426e-06, "loss": 0.2113, "step": 11790 }, { "epoch": 2.36, "grad_norm": 2.1797378063201904, "learning_rate": 1.3242436362326804e-06, "loss": 0.1585, "step": 11800 }, { "epoch": 2.362, "grad_norm": 1.4730124473571777, "learning_rate": 1.3163658355533866e-06, "loss": 0.1928, "step": 11810 }, { "epoch": 2.364, "grad_norm": 1.4231460094451904, "learning_rate": 1.3085079832941528e-06, "loss": 0.2012, "step": 11820 }, { "epoch": 2.366, "grad_norm": 1.8164182901382446, "learning_rate": 1.3006701220085338e-06, "loss": 0.1639, "step": 11830 }, { "epoch": 2.368, "grad_norm": 2.293713092803955, "learning_rate": 1.2928522941418241e-06, "loss": 0.168, "step": 11840 }, { "epoch": 2.37, "grad_norm": 1.6054863929748535, "learning_rate": 1.2850545420308386e-06, "loss": 0.1507, "step": 11850 }, { "epoch": 2.372, "grad_norm": 2.180938959121704, "learning_rate": 1.2772769079036639e-06, "loss": 0.1563, "step": 11860 }, { "epoch": 2.374, "grad_norm": 1.553425908088684, "learning_rate": 1.2695194338794414e-06, "loss": 0.1574, "step": 11870 }, { "epoch": 2.376, "grad_norm": 1.3529037237167358, "learning_rate": 1.2617821619681397e-06, "loss": 0.1941, "step": 11880 }, { "epoch": 2.378, "grad_norm": 1.9350522756576538, "learning_rate": 1.2540651340703231e-06, "loss": 0.1551, "step": 11890 }, { "epoch": 2.38, "grad_norm": 2.1500189304351807, "learning_rate": 1.246368391976927e-06, "loss": 0.166, "step": 11900 }, { "epoch": 2.382, "grad_norm": 1.851035237312317, "learning_rate": 1.2386919773690276e-06, "loss": 0.154, "step": 11910 }, { "epoch": 2.384, "grad_norm": 1.200472116470337, "learning_rate": 1.2310359318176229e-06, "loss": 0.1441, "step": 11920 }, { "epoch": 2.386, "grad_norm": 1.9997992515563965, "learning_rate": 1.2234002967834036e-06, "loss": 0.1886, "step": 11930 }, { "epoch": 2.388, "grad_norm": 2.4099040031433105, "learning_rate": 1.2157851136165243e-06, "loss": 0.1631, "step": 11940 }, { "epoch": 2.39, "grad_norm": 1.5515005588531494, "learning_rate": 1.2081904235563908e-06, "loss": 0.1468, "step": 11950 }, { "epoch": 2.392, "grad_norm": 1.075179100036621, "learning_rate": 1.2006162677314265e-06, "loss": 0.1464, "step": 11960 }, { "epoch": 2.394, "grad_norm": 1.8599796295166016, "learning_rate": 1.1930626871588525e-06, "loss": 0.1865, "step": 11970 }, { "epoch": 2.396, "grad_norm": 1.7627993822097778, "learning_rate": 1.185529722744469e-06, "loss": 0.165, "step": 11980 }, { "epoch": 2.398, "grad_norm": 1.7737939357757568, "learning_rate": 1.1780174152824297e-06, "loss": 0.1545, "step": 11990 }, { "epoch": 2.4, "grad_norm": 0.9698889851570129, "learning_rate": 1.1705258054550212e-06, "loss": 0.1575, "step": 12000 }, { "epoch": 2.402, "grad_norm": 1.0956158638000488, "learning_rate": 1.1630549338324454e-06, "loss": 0.1703, "step": 12010 }, { "epoch": 2.404, "grad_norm": 1.4204548597335815, "learning_rate": 1.155604840872599e-06, "loss": 0.166, "step": 12020 }, { "epoch": 2.406, "grad_norm": 1.6942260265350342, "learning_rate": 1.1481755669208495e-06, "loss": 0.1596, "step": 12030 }, { "epoch": 2.408, "grad_norm": 1.5839786529541016, "learning_rate": 1.1407671522098262e-06, "loss": 0.1931, "step": 12040 }, { "epoch": 2.41, "grad_norm": 2.013857841491699, "learning_rate": 1.1333796368591915e-06, "loss": 0.1791, "step": 12050 }, { "epoch": 2.412, "grad_norm": 1.7067184448242188, "learning_rate": 1.126013060875432e-06, "loss": 0.1849, "step": 12060 }, { "epoch": 2.414, "grad_norm": 1.8866993188858032, "learning_rate": 1.1186674641516415e-06, "loss": 0.1922, "step": 12070 }, { "epoch": 2.416, "grad_norm": 2.4251067638397217, "learning_rate": 1.1113428864672954e-06, "loss": 0.1807, "step": 12080 }, { "epoch": 2.418, "grad_norm": 1.515995740890503, "learning_rate": 1.1040393674880478e-06, "loss": 0.1896, "step": 12090 }, { "epoch": 2.42, "grad_norm": 1.7295435667037964, "learning_rate": 1.0967569467655104e-06, "loss": 0.1473, "step": 12100 }, { "epoch": 2.422, "grad_norm": 1.7556982040405273, "learning_rate": 1.0894956637370363e-06, "loss": 0.1475, "step": 12110 }, { "epoch": 2.424, "grad_norm": 1.9276478290557861, "learning_rate": 1.082255557725511e-06, "loss": 0.1539, "step": 12120 }, { "epoch": 2.426, "grad_norm": 1.5865824222564697, "learning_rate": 1.0750366679391393e-06, "loss": 0.1449, "step": 12130 }, { "epoch": 2.428, "grad_norm": 1.8489198684692383, "learning_rate": 1.0678390334712275e-06, "loss": 0.1547, "step": 12140 }, { "epoch": 2.43, "grad_norm": 1.6484555006027222, "learning_rate": 1.0606626932999775e-06, "loss": 0.1648, "step": 12150 }, { "epoch": 2.432, "grad_norm": 1.7841017246246338, "learning_rate": 1.053507686288276e-06, "loss": 0.1557, "step": 12160 }, { "epoch": 2.434, "grad_norm": 1.6266376972198486, "learning_rate": 1.0463740511834759e-06, "loss": 0.1449, "step": 12170 }, { "epoch": 2.436, "grad_norm": 1.884961724281311, "learning_rate": 1.0392618266171983e-06, "loss": 0.1606, "step": 12180 }, { "epoch": 2.438, "grad_norm": 2.732299566268921, "learning_rate": 1.0321710511051108e-06, "loss": 0.1637, "step": 12190 }, { "epoch": 2.44, "grad_norm": 1.4075685739517212, "learning_rate": 1.0251017630467347e-06, "loss": 0.1798, "step": 12200 }, { "epoch": 2.442, "grad_norm": 1.3892203569412231, "learning_rate": 1.01805400072522e-06, "loss": 0.1867, "step": 12210 }, { "epoch": 2.444, "grad_norm": 1.6456845998764038, "learning_rate": 1.0110278023071445e-06, "loss": 0.1632, "step": 12220 }, { "epoch": 2.446, "grad_norm": 1.5476148128509521, "learning_rate": 1.0040232058423182e-06, "loss": 0.1513, "step": 12230 }, { "epoch": 2.448, "grad_norm": 1.5470598936080933, "learning_rate": 9.970402492635583e-07, "loss": 0.1741, "step": 12240 }, { "epoch": 2.45, "grad_norm": 1.6543699502944946, "learning_rate": 9.900789703864933e-07, "loss": 0.1374, "step": 12250 }, { "epoch": 2.452, "grad_norm": 1.6343821287155151, "learning_rate": 9.831394069093663e-07, "loss": 0.1526, "step": 12260 }, { "epoch": 2.454, "grad_norm": 1.9217232465744019, "learning_rate": 9.762215964128124e-07, "loss": 0.1716, "step": 12270 }, { "epoch": 2.456, "grad_norm": 2.077751874923706, "learning_rate": 9.69325576359672e-07, "loss": 0.1599, "step": 12280 }, { "epoch": 2.458, "grad_norm": 1.3649940490722656, "learning_rate": 9.624513840947764e-07, "loss": 0.1499, "step": 12290 }, { "epoch": 2.46, "grad_norm": 1.173978328704834, "learning_rate": 9.555990568447538e-07, "loss": 0.1685, "step": 12300 }, { "epoch": 2.462, "grad_norm": 1.5035467147827148, "learning_rate": 9.487686317178241e-07, "loss": 0.1905, "step": 12310 }, { "epoch": 2.464, "grad_norm": 1.3181893825531006, "learning_rate": 9.419601457035943e-07, "loss": 0.1994, "step": 12320 }, { "epoch": 2.466, "grad_norm": 1.2825194597244263, "learning_rate": 9.351736356728657e-07, "loss": 0.161, "step": 12330 }, { "epoch": 2.468, "grad_norm": 2.407343626022339, "learning_rate": 9.284091383774313e-07, "loss": 0.1929, "step": 12340 }, { "epoch": 2.4699999999999998, "grad_norm": 1.4245587587356567, "learning_rate": 9.2166669044987e-07, "loss": 0.154, "step": 12350 }, { "epoch": 2.472, "grad_norm": 2.4359230995178223, "learning_rate": 9.149463284033605e-07, "loss": 0.1854, "step": 12360 }, { "epoch": 2.474, "grad_norm": 1.9102237224578857, "learning_rate": 9.08248088631476e-07, "loss": 0.2134, "step": 12370 }, { "epoch": 2.476, "grad_norm": 1.268130898475647, "learning_rate": 9.015720074079837e-07, "loss": 0.2093, "step": 12380 }, { "epoch": 2.4779999999999998, "grad_norm": 1.4418933391571045, "learning_rate": 8.949181208866581e-07, "loss": 0.1573, "step": 12390 }, { "epoch": 2.48, "grad_norm": 1.8565024137496948, "learning_rate": 8.882864651010798e-07, "loss": 0.1778, "step": 12400 }, { "epoch": 2.482, "grad_norm": 1.8131656646728516, "learning_rate": 8.816770759644361e-07, "loss": 0.1609, "step": 12410 }, { "epoch": 2.484, "grad_norm": 1.573459506034851, "learning_rate": 8.750899892693376e-07, "loss": 0.155, "step": 12420 }, { "epoch": 2.4859999999999998, "grad_norm": 1.8090794086456299, "learning_rate": 8.685252406876116e-07, "loss": 0.2023, "step": 12430 }, { "epoch": 2.488, "grad_norm": 1.2365047931671143, "learning_rate": 8.61982865770119e-07, "loss": 0.1746, "step": 12440 }, { "epoch": 2.49, "grad_norm": 1.4933933019638062, "learning_rate": 8.554628999465592e-07, "loss": 0.1726, "step": 12450 }, { "epoch": 2.492, "grad_norm": 1.6978332996368408, "learning_rate": 8.489653785252711e-07, "loss": 0.1853, "step": 12460 }, { "epoch": 2.4939999999999998, "grad_norm": 1.4669007062911987, "learning_rate": 8.424903366930531e-07, "loss": 0.22, "step": 12470 }, { "epoch": 2.496, "grad_norm": 1.660845398902893, "learning_rate": 8.360378095149674e-07, "loss": 0.1646, "step": 12480 }, { "epoch": 2.498, "grad_norm": 1.6666799783706665, "learning_rate": 8.296078319341444e-07, "loss": 0.1526, "step": 12490 }, { "epoch": 2.5, "grad_norm": 1.4847021102905273, "learning_rate": 8.232004387716053e-07, "loss": 0.1516, "step": 12500 }, { "epoch": 2.502, "grad_norm": 1.512980580329895, "learning_rate": 8.16815664726065e-07, "loss": 0.1886, "step": 12510 }, { "epoch": 2.504, "grad_norm": 1.0014886856079102, "learning_rate": 8.104535443737438e-07, "loss": 0.151, "step": 12520 }, { "epoch": 2.5060000000000002, "grad_norm": 1.831697940826416, "learning_rate": 8.041141121681867e-07, "loss": 0.1517, "step": 12530 }, { "epoch": 2.508, "grad_norm": 1.6552419662475586, "learning_rate": 7.977974024400703e-07, "loss": 0.1864, "step": 12540 }, { "epoch": 2.51, "grad_norm": 2.3965301513671875, "learning_rate": 7.91503449397022e-07, "loss": 0.1646, "step": 12550 }, { "epoch": 2.512, "grad_norm": 2.3798716068267822, "learning_rate": 7.852322871234286e-07, "loss": 0.1702, "step": 12560 }, { "epoch": 2.5140000000000002, "grad_norm": 1.1681225299835205, "learning_rate": 7.789839495802581e-07, "loss": 0.1726, "step": 12570 }, { "epoch": 2.516, "grad_norm": 1.7020608186721802, "learning_rate": 7.727584706048735e-07, "loss": 0.1862, "step": 12580 }, { "epoch": 2.518, "grad_norm": 1.7006206512451172, "learning_rate": 7.665558839108467e-07, "loss": 0.1638, "step": 12590 }, { "epoch": 2.52, "grad_norm": 1.530461072921753, "learning_rate": 7.603762230877776e-07, "loss": 0.1511, "step": 12600 }, { "epoch": 2.5220000000000002, "grad_norm": 1.7536654472351074, "learning_rate": 7.542195216011188e-07, "loss": 0.1693, "step": 12610 }, { "epoch": 2.524, "grad_norm": 2.3601443767547607, "learning_rate": 7.480858127919821e-07, "loss": 0.1946, "step": 12620 }, { "epoch": 2.526, "grad_norm": 1.876556158065796, "learning_rate": 7.419751298769667e-07, "loss": 0.1628, "step": 12630 }, { "epoch": 2.528, "grad_norm": 1.4596962928771973, "learning_rate": 7.358875059479792e-07, "loss": 0.1703, "step": 12640 }, { "epoch": 2.5300000000000002, "grad_norm": 1.8861957788467407, "learning_rate": 7.29822973972047e-07, "loss": 0.1865, "step": 12650 }, { "epoch": 2.532, "grad_norm": 1.5087050199508667, "learning_rate": 7.237815667911502e-07, "loss": 0.1665, "step": 12660 }, { "epoch": 2.534, "grad_norm": 1.47129487991333, "learning_rate": 7.177633171220339e-07, "loss": 0.1728, "step": 12670 }, { "epoch": 2.536, "grad_norm": 2.511249542236328, "learning_rate": 7.117682575560386e-07, "loss": 0.2098, "step": 12680 }, { "epoch": 2.5380000000000003, "grad_norm": 1.4766125679016113, "learning_rate": 7.057964205589218e-07, "loss": 0.1697, "step": 12690 }, { "epoch": 2.54, "grad_norm": 2.0010459423065186, "learning_rate": 6.99847838470677e-07, "loss": 0.2231, "step": 12700 }, { "epoch": 2.542, "grad_norm": 2.9428231716156006, "learning_rate": 6.939225435053648e-07, "loss": 0.2108, "step": 12710 }, { "epoch": 2.544, "grad_norm": 2.4707274436950684, "learning_rate": 6.880205677509383e-07, "loss": 0.207, "step": 12720 }, { "epoch": 2.5460000000000003, "grad_norm": 1.7522751092910767, "learning_rate": 6.821419431690629e-07, "loss": 0.1804, "step": 12730 }, { "epoch": 2.548, "grad_norm": 2.5205798149108887, "learning_rate": 6.762867015949514e-07, "loss": 0.2039, "step": 12740 }, { "epoch": 2.55, "grad_norm": 2.847193479537964, "learning_rate": 6.704548747371869e-07, "loss": 0.1945, "step": 12750 }, { "epoch": 2.552, "grad_norm": 1.2266204357147217, "learning_rate": 6.646464941775499e-07, "loss": 0.178, "step": 12760 }, { "epoch": 2.5540000000000003, "grad_norm": 1.5307732820510864, "learning_rate": 6.588615913708524e-07, "loss": 0.1616, "step": 12770 }, { "epoch": 2.556, "grad_norm": 1.3956282138824463, "learning_rate": 6.531001976447637e-07, "loss": 0.1128, "step": 12780 }, { "epoch": 2.558, "grad_norm": 1.526992678642273, "learning_rate": 6.47362344199639e-07, "loss": 0.2029, "step": 12790 }, { "epoch": 2.56, "grad_norm": 1.8423537015914917, "learning_rate": 6.416480621083582e-07, "loss": 0.1969, "step": 12800 }, { "epoch": 2.5620000000000003, "grad_norm": 1.3227084875106812, "learning_rate": 6.359573823161457e-07, "loss": 0.2046, "step": 12810 }, { "epoch": 2.564, "grad_norm": 2.2461440563201904, "learning_rate": 6.302903356404161e-07, "loss": 0.2142, "step": 12820 }, { "epoch": 2.566, "grad_norm": 0.9911752939224243, "learning_rate": 6.246469527705978e-07, "loss": 0.1574, "step": 12830 }, { "epoch": 2.568, "grad_norm": 1.4631303548812866, "learning_rate": 6.190272642679674e-07, "loss": 0.1516, "step": 12840 }, { "epoch": 2.57, "grad_norm": 1.7750544548034668, "learning_rate": 6.134313005654929e-07, "loss": 0.1454, "step": 12850 }, { "epoch": 2.572, "grad_norm": 2.154233455657959, "learning_rate": 6.078590919676575e-07, "loss": 0.1687, "step": 12860 }, { "epoch": 2.574, "grad_norm": 1.879696011543274, "learning_rate": 6.023106686502988e-07, "loss": 0.1999, "step": 12870 }, { "epoch": 2.576, "grad_norm": 1.5651618242263794, "learning_rate": 5.967860606604553e-07, "loss": 0.1647, "step": 12880 }, { "epoch": 2.578, "grad_norm": 1.1055610179901123, "learning_rate": 5.912852979161876e-07, "loss": 0.1461, "step": 12890 }, { "epoch": 2.58, "grad_norm": 1.6748799085617065, "learning_rate": 5.858084102064271e-07, "loss": 0.1708, "step": 12900 }, { "epoch": 2.582, "grad_norm": 1.5193976163864136, "learning_rate": 5.803554271908124e-07, "loss": 0.172, "step": 12910 }, { "epoch": 2.584, "grad_norm": 1.606832504272461, "learning_rate": 5.749263783995279e-07, "loss": 0.1459, "step": 12920 }, { "epoch": 2.586, "grad_norm": 2.012147903442383, "learning_rate": 5.69521293233145e-07, "loss": 0.1558, "step": 12930 }, { "epoch": 2.588, "grad_norm": 1.5900416374206543, "learning_rate": 5.641402009624591e-07, "loss": 0.2086, "step": 12940 }, { "epoch": 2.59, "grad_norm": 2.680753469467163, "learning_rate": 5.587831307283375e-07, "loss": 0.1668, "step": 12950 }, { "epoch": 2.592, "grad_norm": 1.6326476335525513, "learning_rate": 5.534501115415575e-07, "loss": 0.2084, "step": 12960 }, { "epoch": 2.594, "grad_norm": 1.771647572517395, "learning_rate": 5.48141172282648e-07, "loss": 0.1903, "step": 12970 }, { "epoch": 2.596, "grad_norm": 1.6876575946807861, "learning_rate": 5.428563417017335e-07, "loss": 0.1602, "step": 12980 }, { "epoch": 2.598, "grad_norm": 1.6764187812805176, "learning_rate": 5.375956484183875e-07, "loss": 0.2182, "step": 12990 }, { "epoch": 2.6, "grad_norm": 2.2477219104766846, "learning_rate": 5.323591209214612e-07, "loss": 0.1858, "step": 13000 }, { "epoch": 2.602, "grad_norm": 1.2577763795852661, "learning_rate": 5.271467875689429e-07, "loss": 0.1794, "step": 13010 }, { "epoch": 2.604, "grad_norm": 2.3537323474884033, "learning_rate": 5.219586765877998e-07, "loss": 0.1936, "step": 13020 }, { "epoch": 2.606, "grad_norm": 2.4193389415740967, "learning_rate": 5.167948160738206e-07, "loss": 0.193, "step": 13030 }, { "epoch": 2.608, "grad_norm": 1.678281545639038, "learning_rate": 5.116552339914726e-07, "loss": 0.1599, "step": 13040 }, { "epoch": 2.61, "grad_norm": 2.4545416831970215, "learning_rate": 5.065399581737412e-07, "loss": 0.2522, "step": 13050 }, { "epoch": 2.612, "grad_norm": 1.6324633359909058, "learning_rate": 5.014490163219854e-07, "loss": 0.1548, "step": 13060 }, { "epoch": 2.614, "grad_norm": 1.7150189876556396, "learning_rate": 4.963824360057868e-07, "loss": 0.1563, "step": 13070 }, { "epoch": 2.616, "grad_norm": 1.821446418762207, "learning_rate": 4.913402446627946e-07, "loss": 0.1644, "step": 13080 }, { "epoch": 2.618, "grad_norm": 1.215470552444458, "learning_rate": 4.863224695985858e-07, "loss": 0.1826, "step": 13090 }, { "epoch": 2.62, "grad_norm": 1.6227540969848633, "learning_rate": 4.813291379865126e-07, "loss": 0.2281, "step": 13100 }, { "epoch": 2.622, "grad_norm": 2.389894723892212, "learning_rate": 4.763602768675529e-07, "loss": 0.1783, "step": 13110 }, { "epoch": 2.624, "grad_norm": 2.125441312789917, "learning_rate": 4.714159131501689e-07, "loss": 0.2042, "step": 13120 }, { "epoch": 2.626, "grad_norm": 1.7118359804153442, "learning_rate": 4.664960736101598e-07, "loss": 0.1868, "step": 13130 }, { "epoch": 2.628, "grad_norm": 2.2027170658111572, "learning_rate": 4.61600784890513e-07, "loss": 0.1808, "step": 13140 }, { "epoch": 2.63, "grad_norm": 1.8995624780654907, "learning_rate": 4.567300735012653e-07, "loss": 0.1864, "step": 13150 }, { "epoch": 2.632, "grad_norm": 1.7229032516479492, "learning_rate": 4.5188396581935856e-07, "loss": 0.1915, "step": 13160 }, { "epoch": 2.634, "grad_norm": 2.252855062484741, "learning_rate": 4.470624880884905e-07, "loss": 0.2062, "step": 13170 }, { "epoch": 2.636, "grad_norm": 1.9822123050689697, "learning_rate": 4.4226566641898173e-07, "loss": 0.1733, "step": 13180 }, { "epoch": 2.638, "grad_norm": 1.441658854484558, "learning_rate": 4.37493526787629e-07, "loss": 0.1911, "step": 13190 }, { "epoch": 2.64, "grad_norm": 1.633154273033142, "learning_rate": 4.327460950375623e-07, "loss": 0.1701, "step": 13200 }, { "epoch": 2.642, "grad_norm": 1.7371437549591064, "learning_rate": 4.280233968781139e-07, "loss": 0.1534, "step": 13210 }, { "epoch": 2.644, "grad_norm": 2.687547445297241, "learning_rate": 4.233254578846657e-07, "loss": 0.1954, "step": 13220 }, { "epoch": 2.646, "grad_norm": 1.7198002338409424, "learning_rate": 4.186523034985279e-07, "loss": 0.172, "step": 13230 }, { "epoch": 2.648, "grad_norm": 1.4830220937728882, "learning_rate": 4.140039590267836e-07, "loss": 0.2197, "step": 13240 }, { "epoch": 2.65, "grad_norm": 2.4155113697052, "learning_rate": 4.0938044964216164e-07, "loss": 0.1562, "step": 13250 }, { "epoch": 2.652, "grad_norm": 2.1205966472625732, "learning_rate": 4.0478180038290296e-07, "loss": 0.1917, "step": 13260 }, { "epoch": 2.654, "grad_norm": 1.5603994131088257, "learning_rate": 4.002080361526156e-07, "loss": 0.1492, "step": 13270 }, { "epoch": 2.656, "grad_norm": 2.272676467895508, "learning_rate": 3.9565918172014495e-07, "loss": 0.1571, "step": 13280 }, { "epoch": 2.658, "grad_norm": 2.0802717208862305, "learning_rate": 3.9113526171944195e-07, "loss": 0.1503, "step": 13290 }, { "epoch": 2.66, "grad_norm": 1.6078263521194458, "learning_rate": 3.866363006494256e-07, "loss": 0.1724, "step": 13300 }, { "epoch": 2.662, "grad_norm": 2.419299364089966, "learning_rate": 3.821623228738536e-07, "loss": 0.188, "step": 13310 }, { "epoch": 2.664, "grad_norm": 2.157850742340088, "learning_rate": 3.777133526211857e-07, "loss": 0.1876, "step": 13320 }, { "epoch": 2.666, "grad_norm": 2.2567625045776367, "learning_rate": 3.732894139844578e-07, "loss": 0.1865, "step": 13330 }, { "epoch": 2.668, "grad_norm": 1.955741047859192, "learning_rate": 3.688905309211488e-07, "loss": 0.1781, "step": 13340 }, { "epoch": 2.67, "grad_norm": 1.4101120233535767, "learning_rate": 3.6451672725304974e-07, "loss": 0.1871, "step": 13350 }, { "epoch": 2.672, "grad_norm": 2.0826516151428223, "learning_rate": 3.601680266661367e-07, "loss": 0.1913, "step": 13360 }, { "epoch": 2.674, "grad_norm": 2.287477493286133, "learning_rate": 3.5584445271044544e-07, "loss": 0.2232, "step": 13370 }, { "epoch": 2.676, "grad_norm": 2.551602602005005, "learning_rate": 3.515460287999345e-07, "loss": 0.1898, "step": 13380 }, { "epoch": 2.678, "grad_norm": 1.6702704429626465, "learning_rate": 3.472727782123697e-07, "loss": 0.1873, "step": 13390 }, { "epoch": 2.68, "grad_norm": 1.998603105545044, "learning_rate": 3.430247240891904e-07, "loss": 0.2089, "step": 13400 }, { "epoch": 2.682, "grad_norm": 2.5154683589935303, "learning_rate": 3.3880188943538617e-07, "loss": 0.1813, "step": 13410 }, { "epoch": 2.684, "grad_norm": 1.4025160074234009, "learning_rate": 3.3460429711937417e-07, "loss": 0.1896, "step": 13420 }, { "epoch": 2.686, "grad_norm": 1.7359564304351807, "learning_rate": 3.304319698728714e-07, "loss": 0.1729, "step": 13430 }, { "epoch": 2.6879999999999997, "grad_norm": 1.8465240001678467, "learning_rate": 3.262849302907767e-07, "loss": 0.1519, "step": 13440 }, { "epoch": 2.69, "grad_norm": 1.9284135103225708, "learning_rate": 3.2216320083104434e-07, "loss": 0.2169, "step": 13450 }, { "epoch": 2.692, "grad_norm": 1.5323783159255981, "learning_rate": 3.180668038145629e-07, "loss": 0.1498, "step": 13460 }, { "epoch": 2.694, "grad_norm": 1.5375773906707764, "learning_rate": 3.1399576142503606e-07, "loss": 0.2163, "step": 13470 }, { "epoch": 2.6959999999999997, "grad_norm": 2.2655889987945557, "learning_rate": 3.0995009570886305e-07, "loss": 0.1614, "step": 13480 }, { "epoch": 2.698, "grad_norm": 2.173973560333252, "learning_rate": 3.05929828575014e-07, "loss": 0.155, "step": 13490 }, { "epoch": 2.7, "grad_norm": 1.8570374250411987, "learning_rate": 3.01934981794918e-07, "loss": 0.1896, "step": 13500 }, { "epoch": 2.702, "grad_norm": 1.7890677452087402, "learning_rate": 2.9796557700234317e-07, "loss": 0.1941, "step": 13510 }, { "epoch": 2.7039999999999997, "grad_norm": 1.8128156661987305, "learning_rate": 2.940216356932746e-07, "loss": 0.2079, "step": 13520 }, { "epoch": 2.706, "grad_norm": 1.5485886335372925, "learning_rate": 2.901031792258058e-07, "loss": 0.1603, "step": 13530 }, { "epoch": 2.708, "grad_norm": 1.4785550832748413, "learning_rate": 2.862102288200186e-07, "loss": 0.2021, "step": 13540 }, { "epoch": 2.71, "grad_norm": 2.2677414417266846, "learning_rate": 2.823428055578664e-07, "loss": 0.1853, "step": 13550 }, { "epoch": 2.7119999999999997, "grad_norm": 1.6280418634414673, "learning_rate": 2.7850093038306493e-07, "loss": 0.164, "step": 13560 }, { "epoch": 2.714, "grad_norm": 1.7581329345703125, "learning_rate": 2.746846241009765e-07, "loss": 0.2056, "step": 13570 }, { "epoch": 2.716, "grad_norm": 1.453389286994934, "learning_rate": 2.7089390737849386e-07, "loss": 0.1762, "step": 13580 }, { "epoch": 2.718, "grad_norm": 1.141204595565796, "learning_rate": 2.67128800743936e-07, "loss": 0.2031, "step": 13590 }, { "epoch": 2.7199999999999998, "grad_norm": 1.8374614715576172, "learning_rate": 2.6338932458692847e-07, "loss": 0.1961, "step": 13600 }, { "epoch": 2.722, "grad_norm": 1.7040131092071533, "learning_rate": 2.596754991583017e-07, "loss": 0.1614, "step": 13610 }, { "epoch": 2.724, "grad_norm": 1.686630368232727, "learning_rate": 2.5598734456997287e-07, "loss": 0.1527, "step": 13620 }, { "epoch": 2.726, "grad_norm": 1.6243407726287842, "learning_rate": 2.523248807948403e-07, "loss": 0.164, "step": 13630 }, { "epoch": 2.7279999999999998, "grad_norm": 2.1291511058807373, "learning_rate": 2.486881276666808e-07, "loss": 0.1484, "step": 13640 }, { "epoch": 2.73, "grad_norm": 1.8856449127197266, "learning_rate": 2.4507710488003155e-07, "loss": 0.2218, "step": 13650 }, { "epoch": 2.732, "grad_norm": 1.4211268424987793, "learning_rate": 2.414918319900922e-07, "loss": 0.1666, "step": 13660 }, { "epoch": 2.734, "grad_norm": 1.9689491987228394, "learning_rate": 2.379323284126156e-07, "loss": 0.2322, "step": 13670 }, { "epoch": 2.7359999999999998, "grad_norm": 1.373671054840088, "learning_rate": 2.3439861342380243e-07, "loss": 0.1145, "step": 13680 }, { "epoch": 2.738, "grad_norm": 1.6998653411865234, "learning_rate": 2.3089070616019838e-07, "loss": 0.1484, "step": 13690 }, { "epoch": 2.74, "grad_norm": 2.0054614543914795, "learning_rate": 2.2740862561858812e-07, "loss": 0.1963, "step": 13700 }, { "epoch": 2.742, "grad_norm": 2.4972550868988037, "learning_rate": 2.2395239065589436e-07, "loss": 0.1747, "step": 13710 }, { "epoch": 2.7439999999999998, "grad_norm": 1.2388681173324585, "learning_rate": 2.2052201998907673e-07, "loss": 0.1661, "step": 13720 }, { "epoch": 2.746, "grad_norm": 1.4141393899917603, "learning_rate": 2.1711753219502584e-07, "loss": 0.1864, "step": 13730 }, { "epoch": 2.748, "grad_norm": 1.2588516473770142, "learning_rate": 2.1373894571046772e-07, "loss": 0.1865, "step": 13740 }, { "epoch": 2.75, "grad_norm": 2.5091469287872314, "learning_rate": 2.103862788318628e-07, "loss": 0.1887, "step": 13750 }, { "epoch": 2.752, "grad_norm": 1.595197081565857, "learning_rate": 2.0705954971530385e-07, "loss": 0.1678, "step": 13760 }, { "epoch": 2.754, "grad_norm": 2.847097396850586, "learning_rate": 2.0375877637642038e-07, "loss": 0.1733, "step": 13770 }, { "epoch": 2.7560000000000002, "grad_norm": 1.9094551801681519, "learning_rate": 2.0048397669028164e-07, "loss": 0.1525, "step": 13780 }, { "epoch": 2.758, "grad_norm": 1.487987995147705, "learning_rate": 1.9723516839129765e-07, "loss": 0.1754, "step": 13790 }, { "epoch": 2.76, "grad_norm": 2.1492459774017334, "learning_rate": 1.9401236907312437e-07, "loss": 0.1516, "step": 13800 }, { "epoch": 2.762, "grad_norm": 1.7400575876235962, "learning_rate": 1.9081559618856938e-07, "loss": 0.1693, "step": 13810 }, { "epoch": 2.7640000000000002, "grad_norm": 1.5171923637390137, "learning_rate": 1.8764486704949402e-07, "loss": 0.1957, "step": 13820 }, { "epoch": 2.766, "grad_norm": 2.1869702339172363, "learning_rate": 1.8450019882672366e-07, "loss": 0.1919, "step": 13830 }, { "epoch": 2.768, "grad_norm": 1.4110556840896606, "learning_rate": 1.8138160854995145e-07, "loss": 0.195, "step": 13840 }, { "epoch": 2.77, "grad_norm": 1.9794977903366089, "learning_rate": 1.7828911310764974e-07, "loss": 0.1523, "step": 13850 }, { "epoch": 2.7720000000000002, "grad_norm": 1.8616331815719604, "learning_rate": 1.7522272924697438e-07, "loss": 0.1648, "step": 13860 }, { "epoch": 2.774, "grad_norm": 1.7763707637786865, "learning_rate": 1.7218247357367656e-07, "loss": 0.1844, "step": 13870 }, { "epoch": 2.776, "grad_norm": 1.4819947481155396, "learning_rate": 1.6916836255201297e-07, "loss": 0.1744, "step": 13880 }, { "epoch": 2.778, "grad_norm": 1.3867019414901733, "learning_rate": 1.6618041250465623e-07, "loss": 0.1908, "step": 13890 }, { "epoch": 2.7800000000000002, "grad_norm": 2.0330920219421387, "learning_rate": 1.6321863961260452e-07, "loss": 0.1612, "step": 13900 }, { "epoch": 2.782, "grad_norm": 2.5724823474884033, "learning_rate": 1.602830599150984e-07, "loss": 0.2228, "step": 13910 }, { "epoch": 2.784, "grad_norm": 1.4262503385543823, "learning_rate": 1.573736893095301e-07, "loss": 0.1397, "step": 13920 }, { "epoch": 2.786, "grad_norm": 1.5158369541168213, "learning_rate": 1.5449054355135718e-07, "loss": 0.1641, "step": 13930 }, { "epoch": 2.7880000000000003, "grad_norm": 1.3711594343185425, "learning_rate": 1.5163363825402121e-07, "loss": 0.213, "step": 13940 }, { "epoch": 2.79, "grad_norm": 1.3406068086624146, "learning_rate": 1.488029888888598e-07, "loss": 0.1991, "step": 13950 }, { "epoch": 2.792, "grad_norm": 1.1743435859680176, "learning_rate": 1.459986107850231e-07, "loss": 0.1848, "step": 13960 }, { "epoch": 2.794, "grad_norm": 1.5763418674468994, "learning_rate": 1.4322051912939173e-07, "loss": 0.166, "step": 13970 }, { "epoch": 2.7960000000000003, "grad_norm": 1.492067813873291, "learning_rate": 1.40468728966493e-07, "loss": 0.1746, "step": 13980 }, { "epoch": 2.798, "grad_norm": 1.7857205867767334, "learning_rate": 1.3774325519842423e-07, "loss": 0.2087, "step": 13990 }, { "epoch": 2.8, "grad_norm": 2.416992425918579, "learning_rate": 1.3504411258476514e-07, "loss": 0.1844, "step": 14000 }, { "epoch": 2.802, "grad_norm": 2.181525945663452, "learning_rate": 1.323713157425005e-07, "loss": 0.1949, "step": 14010 }, { "epoch": 2.8040000000000003, "grad_norm": 1.5728259086608887, "learning_rate": 1.29724879145946e-07, "loss": 0.1979, "step": 14020 }, { "epoch": 2.806, "grad_norm": 1.9201794862747192, "learning_rate": 1.2710481712666144e-07, "loss": 0.1512, "step": 14030 }, { "epoch": 2.808, "grad_norm": 0.9020956158638, "learning_rate": 1.245111438733798e-07, "loss": 0.1687, "step": 14040 }, { "epoch": 2.81, "grad_norm": 1.2932790517807007, "learning_rate": 1.2194387343192504e-07, "loss": 0.1652, "step": 14050 }, { "epoch": 2.8120000000000003, "grad_norm": 1.8155263662338257, "learning_rate": 1.194030197051421e-07, "loss": 0.1753, "step": 14060 }, { "epoch": 2.814, "grad_norm": 2.2060601711273193, "learning_rate": 1.1688859645281659e-07, "loss": 0.1405, "step": 14070 }, { "epoch": 2.816, "grad_norm": 1.731437087059021, "learning_rate": 1.1440061729160235e-07, "loss": 0.1529, "step": 14080 }, { "epoch": 2.818, "grad_norm": 2.3887579441070557, "learning_rate": 1.1193909569494676e-07, "loss": 0.2014, "step": 14090 }, { "epoch": 2.82, "grad_norm": 1.5371918678283691, "learning_rate": 1.0950404499302015e-07, "loss": 0.1534, "step": 14100 }, { "epoch": 2.822, "grad_norm": 1.737833857536316, "learning_rate": 1.0709547837263967e-07, "loss": 0.1727, "step": 14110 }, { "epoch": 2.824, "grad_norm": 1.5639506578445435, "learning_rate": 1.0471340887720171e-07, "loss": 0.1899, "step": 14120 }, { "epoch": 2.826, "grad_norm": 1.4468129873275757, "learning_rate": 1.0235784940660965e-07, "loss": 0.1617, "step": 14130 }, { "epoch": 2.828, "grad_norm": 1.7118704319000244, "learning_rate": 1.0002881271720222e-07, "loss": 0.1679, "step": 14140 }, { "epoch": 2.83, "grad_norm": 1.9047939777374268, "learning_rate": 9.772631142168864e-08, "loss": 0.196, "step": 14150 }, { "epoch": 2.832, "grad_norm": 2.239691734313965, "learning_rate": 9.545035798907642e-08, "loss": 0.1941, "step": 14160 }, { "epoch": 2.834, "grad_norm": 2.0404582023620605, "learning_rate": 9.320096474460527e-08, "loss": 0.1634, "step": 14170 }, { "epoch": 2.836, "grad_norm": 2.4190192222595215, "learning_rate": 9.097814386968052e-08, "loss": 0.2195, "step": 14180 }, { "epoch": 2.838, "grad_norm": 2.054330348968506, "learning_rate": 8.878190740180759e-08, "loss": 0.1953, "step": 14190 }, { "epoch": 2.84, "grad_norm": 2.02219557762146, "learning_rate": 8.661226723452542e-08, "loss": 0.1761, "step": 14200 }, { "epoch": 2.842, "grad_norm": 1.775745153427124, "learning_rate": 8.446923511734317e-08, "loss": 0.1449, "step": 14210 }, { "epoch": 2.844, "grad_norm": 1.5100916624069214, "learning_rate": 8.235282265567635e-08, "loss": 0.1972, "step": 14220 }, { "epoch": 2.846, "grad_norm": 2.3685286045074463, "learning_rate": 8.02630413107841e-08, "loss": 0.1673, "step": 14230 }, { "epoch": 2.848, "grad_norm": 1.453798770904541, "learning_rate": 7.819990239970654e-08, "loss": 0.1724, "step": 14240 }, { "epoch": 2.85, "grad_norm": 1.8765348196029663, "learning_rate": 7.616341709520359e-08, "loss": 0.1887, "step": 14250 }, { "epoch": 2.852, "grad_norm": 1.9411275386810303, "learning_rate": 7.415359642569564e-08, "loss": 0.1774, "step": 14260 }, { "epoch": 2.854, "grad_norm": 1.686895489692688, "learning_rate": 7.21704512752025e-08, "loss": 0.1713, "step": 14270 }, { "epoch": 2.856, "grad_norm": 1.6647790670394897, "learning_rate": 7.021399238328452e-08, "loss": 0.1696, "step": 14280 }, { "epoch": 2.858, "grad_norm": 2.324164390563965, "learning_rate": 6.828423034498488e-08, "loss": 0.1702, "step": 14290 }, { "epoch": 2.86, "grad_norm": 1.610374093055725, "learning_rate": 6.638117561077295e-08, "loss": 0.1636, "step": 14300 }, { "epoch": 2.862, "grad_norm": 1.3403874635696411, "learning_rate": 6.450483848648547e-08, "loss": 0.1701, "step": 14310 }, { "epoch": 2.864, "grad_norm": 2.045170545578003, "learning_rate": 6.265522913327326e-08, "loss": 0.1838, "step": 14320 }, { "epoch": 2.866, "grad_norm": 1.726438045501709, "learning_rate": 6.083235756754513e-08, "loss": 0.1493, "step": 14330 }, { "epoch": 2.868, "grad_norm": 1.6481523513793945, "learning_rate": 5.903623366091349e-08, "loss": 0.1588, "step": 14340 }, { "epoch": 2.87, "grad_norm": 1.7567453384399414, "learning_rate": 5.726686714013996e-08, "loss": 0.156, "step": 14350 }, { "epoch": 2.872, "grad_norm": 1.5608904361724854, "learning_rate": 5.552426758708429e-08, "loss": 0.212, "step": 14360 }, { "epoch": 2.874, "grad_norm": 1.747316837310791, "learning_rate": 5.380844443865274e-08, "loss": 0.1706, "step": 14370 }, { "epoch": 2.876, "grad_norm": 2.100895881652832, "learning_rate": 5.2119406986745336e-08, "loss": 0.168, "step": 14380 }, { "epoch": 2.878, "grad_norm": 1.1627360582351685, "learning_rate": 5.0457164378205914e-08, "loss": 0.1368, "step": 14390 }, { "epoch": 2.88, "grad_norm": 1.7362346649169922, "learning_rate": 4.882172561477438e-08, "loss": 0.1627, "step": 14400 }, { "epoch": 2.882, "grad_norm": 2.2970685958862305, "learning_rate": 4.7213099553035655e-08, "loss": 0.1751, "step": 14410 }, { "epoch": 2.884, "grad_norm": 2.3094658851623535, "learning_rate": 4.563129490437246e-08, "loss": 0.2271, "step": 14420 }, { "epoch": 2.886, "grad_norm": 1.8514689207077026, "learning_rate": 4.4076320234919254e-08, "loss": 0.162, "step": 14430 }, { "epoch": 2.888, "grad_norm": 1.9428884983062744, "learning_rate": 4.2548183965513415e-08, "loss": 0.2024, "step": 14440 }, { "epoch": 2.89, "grad_norm": 1.88101327419281, "learning_rate": 4.104689437165354e-08, "loss": 0.162, "step": 14450 }, { "epoch": 2.892, "grad_norm": 3.1377224922180176, "learning_rate": 3.957245958345013e-08, "loss": 0.1987, "step": 14460 }, { "epoch": 2.894, "grad_norm": 1.4738689661026, "learning_rate": 3.812488758558386e-08, "loss": 0.1797, "step": 14470 }, { "epoch": 2.896, "grad_norm": 1.6967921257019043, "learning_rate": 3.6704186217263457e-08, "loss": 0.1748, "step": 14480 }, { "epoch": 2.898, "grad_norm": 1.5632615089416504, "learning_rate": 3.531036317218128e-08, "loss": 0.1959, "step": 14490 }, { "epoch": 2.9, "grad_norm": 1.3949593305587769, "learning_rate": 3.394342599847111e-08, "loss": 0.1612, "step": 14500 }, { "epoch": 2.902, "grad_norm": 1.8362523317337036, "learning_rate": 3.260338209867153e-08, "loss": 0.1713, "step": 14510 }, { "epoch": 2.904, "grad_norm": 1.7234903573989868, "learning_rate": 3.129023872967874e-08, "loss": 0.1514, "step": 14520 }, { "epoch": 2.906, "grad_norm": 1.4591097831726074, "learning_rate": 3.0004003002714886e-08, "loss": 0.1996, "step": 14530 }, { "epoch": 2.908, "grad_norm": 2.5706465244293213, "learning_rate": 2.8744681883284274e-08, "loss": 0.1624, "step": 14540 }, { "epoch": 2.91, "grad_norm": 1.6069506406784058, "learning_rate": 2.7512282191136663e-08, "loss": 0.1696, "step": 14550 }, { "epoch": 2.912, "grad_norm": 1.6182852983474731, "learning_rate": 2.6306810600233435e-08, "loss": 0.1736, "step": 14560 }, { "epoch": 2.914, "grad_norm": 1.7584065198898315, "learning_rate": 2.5128273638706513e-08, "loss": 0.1879, "step": 14570 }, { "epoch": 2.916, "grad_norm": 2.2820541858673096, "learning_rate": 2.3976677688827276e-08, "loss": 0.194, "step": 14580 }, { "epoch": 2.918, "grad_norm": 1.8409901857376099, "learning_rate": 2.285202898696881e-08, "loss": 0.208, "step": 14590 }, { "epoch": 2.92, "grad_norm": 2.1918840408325195, "learning_rate": 2.175433362357482e-08, "loss": 0.1869, "step": 14600 }, { "epoch": 2.922, "grad_norm": 1.8916000127792358, "learning_rate": 2.0683597543124655e-08, "loss": 0.1531, "step": 14610 }, { "epoch": 2.924, "grad_norm": 1.347509741783142, "learning_rate": 1.963982654410279e-08, "loss": 0.1879, "step": 14620 }, { "epoch": 2.926, "grad_norm": 1.712788701057434, "learning_rate": 1.862302627896495e-08, "loss": 0.1711, "step": 14630 }, { "epoch": 2.928, "grad_norm": 2.293809175491333, "learning_rate": 1.763320225411036e-08, "loss": 0.2065, "step": 14640 }, { "epoch": 2.93, "grad_norm": 2.585679531097412, "learning_rate": 1.6670359829850657e-08, "loss": 0.2012, "step": 14650 }, { "epoch": 2.932, "grad_norm": 2.2906439304351807, "learning_rate": 1.573450422037992e-08, "loss": 0.1803, "step": 14660 }, { "epoch": 2.934, "grad_norm": 1.0971269607543945, "learning_rate": 1.4825640493747462e-08, "loss": 0.1916, "step": 14670 }, { "epoch": 2.936, "grad_norm": 1.7314507961273193, "learning_rate": 1.3943773571831188e-08, "loss": 0.205, "step": 14680 }, { "epoch": 2.9379999999999997, "grad_norm": 1.9683465957641602, "learning_rate": 1.3088908230308728e-08, "loss": 0.1355, "step": 14690 }, { "epoch": 2.94, "grad_norm": 2.5472261905670166, "learning_rate": 1.2261049098634125e-08, "loss": 0.1713, "step": 14700 }, { "epoch": 2.942, "grad_norm": 1.2666493654251099, "learning_rate": 1.146020066001119e-08, "loss": 0.1523, "step": 14710 }, { "epoch": 2.944, "grad_norm": 1.82552170753479, "learning_rate": 1.0686367251368512e-08, "loss": 0.1705, "step": 14720 }, { "epoch": 2.9459999999999997, "grad_norm": 1.3673129081726074, "learning_rate": 9.939553063337826e-09, "loss": 0.1737, "step": 14730 }, { "epoch": 2.948, "grad_norm": 2.0833985805511475, "learning_rate": 9.219762140231237e-09, "loss": 0.1701, "step": 14740 }, { "epoch": 2.95, "grad_norm": 1.2477442026138306, "learning_rate": 8.526998380016804e-09, "loss": 0.1864, "step": 14750 }, { "epoch": 2.952, "grad_norm": 2.231452226638794, "learning_rate": 7.861265534300222e-09, "loss": 0.1928, "step": 14760 }, { "epoch": 2.9539999999999997, "grad_norm": 1.7045234441757202, "learning_rate": 7.222567208303721e-09, "loss": 0.1471, "step": 14770 }, { "epoch": 2.956, "grad_norm": 1.1955382823944092, "learning_rate": 6.610906860845534e-09, "loss": 0.1582, "step": 14780 }, { "epoch": 2.958, "grad_norm": 1.9184893369674683, "learning_rate": 6.026287804322683e-09, "loss": 0.2123, "step": 14790 }, { "epoch": 2.96, "grad_norm": 2.046898126602173, "learning_rate": 5.468713204692111e-09, "loss": 0.1694, "step": 14800 }, { "epoch": 2.9619999999999997, "grad_norm": 1.4556955099105835, "learning_rate": 4.9381860814529115e-09, "loss": 0.1834, "step": 14810 }, { "epoch": 2.964, "grad_norm": 1.5225237607955933, "learning_rate": 4.434709307631901e-09, "loss": 0.1509, "step": 14820 }, { "epoch": 2.966, "grad_norm": 2.2594845294952393, "learning_rate": 3.9582856097658554e-09, "loss": 0.1882, "step": 14830 }, { "epoch": 2.968, "grad_norm": 1.5588994026184082, "learning_rate": 3.5089175678892916e-09, "loss": 0.203, "step": 14840 }, { "epoch": 2.9699999999999998, "grad_norm": 2.009692668914795, "learning_rate": 3.0866076155172677e-09, "loss": 0.1684, "step": 14850 }, { "epoch": 2.972, "grad_norm": 1.583719253540039, "learning_rate": 2.6913580396359384e-09, "loss": 0.1863, "step": 14860 }, { "epoch": 2.974, "grad_norm": 1.9162685871124268, "learning_rate": 2.323170980685907e-09, "loss": 0.1777, "step": 14870 }, { "epoch": 2.976, "grad_norm": 1.2543405294418335, "learning_rate": 1.982048432554451e-09, "loss": 0.1686, "step": 14880 }, { "epoch": 2.9779999999999998, "grad_norm": 2.3343191146850586, "learning_rate": 1.6679922425638651e-09, "loss": 0.2004, "step": 14890 }, { "epoch": 2.98, "grad_norm": 2.2958972454071045, "learning_rate": 1.3810041114581397e-09, "loss": 0.2172, "step": 14900 }, { "epoch": 2.982, "grad_norm": 1.3667978048324585, "learning_rate": 1.1210855933996289e-09, "loss": 0.1731, "step": 14910 }, { "epoch": 2.984, "grad_norm": 2.113403558731079, "learning_rate": 8.88238095955174e-10, "loss": 0.1527, "step": 14920 }, { "epoch": 2.9859999999999998, "grad_norm": 2.218931198120117, "learning_rate": 6.824628800911059e-10, "loss": 0.1829, "step": 14930 }, { "epoch": 2.988, "grad_norm": 1.7202938795089722, "learning_rate": 5.037610601665855e-10, "loss": 0.148, "step": 14940 }, { "epoch": 2.99, "grad_norm": 1.6666905879974365, "learning_rate": 3.521336039263856e-10, "loss": 0.1999, "step": 14950 }, { "epoch": 2.992, "grad_norm": 1.587571620941162, "learning_rate": 2.2758133249756175e-10, "loss": 0.14, "step": 14960 }, { "epoch": 2.9939999999999998, "grad_norm": 1.2970430850982666, "learning_rate": 1.3010492038223465e-10, "loss": 0.1418, "step": 14970 }, { "epoch": 2.996, "grad_norm": 1.8305574655532837, "learning_rate": 5.970489545537028e-11, "loss": 0.1642, "step": 14980 }, { "epoch": 2.998, "grad_norm": 1.9229685068130493, "learning_rate": 1.63816389631144e-11, "loss": 0.1686, "step": 14990 }, { "epoch": 3.0, "grad_norm": 1.2971068620681763, "learning_rate": 1.3538551946190582e-13, "loss": 0.1871, "step": 15000 } ], "logging_steps": 10, "max_steps": 15000, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.3591424743807386e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }